src/runtime.cc - Issue 4189001: Faster ascii string case conversion.

Side by Side Diff: src/runtime.cc

Issue 4189001: Faster ascii string case conversion. (Closed)

Patch Set: Fix typo Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved.	1 // Copyright 2006-2009 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 4618 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4629 // we simple return the result and let the converted string	4629 // we simple return the result and let the converted string

4630 // become garbage; there is no reason to keep two identical strings	4630 // become garbage; there is no reason to keep two identical strings

4631 // alive.	4631 // alive.

4632 return s;	4632 return s;

4633 }	4633 }

4634 }	4634 }

4635	4635

4636	4636

4637 namespace {	4637 namespace {

4638	4638

	4639 static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;

	4640

	4641

	4642 // Given a word and two range boundaries returns a word with high bit

	4643 // set in every byte iff the corresponding input byte was strictly in

	4644 // the range (m, n). All the other bits in the result are cleared.
	antonm 2010/10/26 17:17:33 maybe follow more standard [;) intervals maybe follow more standard [;) intervals Vitaly Repeshko 2010/10/26 18:14:48 I want to make usages of m and n similar, so eithe Show quoted text On 2010/10/26 17:17:33, antonm wrote: > maybe follow more standard [;) intervals I want to make usages of m and n similar, so either (;) or [;].
	4645 // This function is only useful when it can be inlined and the

	4646 // boundaries are statically known.

	4647 // Requires: the input word and the boundaries must be ascii.
	antonm 2010/10/26 17:17:33 I think ascii requires additional clarification. I think ascii requires additional clarification. And maybe it is better to formulate it as a restriction on the range of values Vitaly Repeshko 2010/10/26 18:14:48 Agreed. Extended the comment. Show quoted text On 2010/10/26 17:17:33, antonm wrote: > I think ascii requires additional clarification. And maybe it is better to > formulate it as a restriction on the range of values Agreed. Extended the comment.
	4648 static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {

	4649 // Every byte in an ascii string is less than or equal to 0x7F.

	4650 ASSERT((w & (kOneInEveryByte * 0x7F)) == w);

	4651 ASSERT(0 < m && m < n && n < 0x7F);

	4652 // Has high bit set in every w byte less than n.

	4653 uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;

	4654 // Has high bit set in every w byte greater than m.

	4655 uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);

	4656 return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));

	4657 }

	4658

	4659

	4660 enum AsciiCaseConversion {

	4661 ASCII_TO_LOWER,

	4662 ASCII_TO_UPPER

	4663 };

	4664

	4665

	4666 template <AsciiCaseConversion dir>

	4667 struct FastAsciiConverter {

	4668 static bool Convert(char* dst, char* src, int length) {

	4669 #ifdef DEBUG

	4670 char* saved_dst = dst;

	4671 char* saved_src = src;

	4672 #endif

	4673 // We rely on the distance between lower and upper case letters

	4674 // being a power of 2.

	4675 ASSERT('a' - 'A' == (1 << 5));

	4676 // Boundaries for the range of input characters than require conversion.

	4677 const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;

	4678 const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;

	4679 bool changed = false;

	4680 char* const limit = src + length;

	4681 #ifdef V8_HOST_CAN_READ_UNALIGNED

	4682 // Process the prefix of the input that requires no conversion one

	4683 // (machine) word at a time.

	4684 while (src <= limit - sizeof(uintptr_t)) {
	antonm 2010/10/26 17:17:33 Maybe convert src to uintptr_t? That would allow Maybe convert src to uintptr_t? That would allow to get rid of the casts and sizeof's. Vitaly Repeshko 2010/10/26 18:14:48 Ahh, I'm really afraid of the compiler emitting di Show quoted text On 2010/10/26 17:17:33, antonm wrote: > Maybe convert src to uintptr_t*? That would allow to get rid of the casts and > sizeof's. Ahh, I'm really afraid of the compiler emitting different code.
	4685 uintptr_t w = reinterpret_cast<uintptr_t>(src);

	4686 if (AsciiRangeMask(w, lo, hi) != 0) {

	4687 changed = true;

	4688 break;

	4689 }

	4690 reinterpret_cast<uintptr_t>(dst) = w;

	4691 src += sizeof(uintptr_t);

	4692 dst += sizeof(uintptr_t);

	4693 }

	4694 // Process the remainder of the input performing conversion when

	4695 // required one word at a time.

	4696 while (src <= limit - sizeof(uintptr_t)) {

	4697 uintptr_t w = reinterpret_cast<uintptr_t>(src);

	4698 uintptr_t m = AsciiRangeMask(w, lo, hi);

	4699 // The mask has high (7th) bit set in every byte that needs

	4700 // conversion and we know that the distance between cases is

	4701 // 1 << 5.

	4702 reinterpret_cast<uintptr_t>(dst) = w ^ (m >> 2);

	4703 src += sizeof(uintptr_t);

	4704 dst += sizeof(uintptr_t);

	4705 }

	4706 #endif

	4707 // Process the last few bytes of the input (or the whole input if

	4708 // unaligned accesses are not supported).

	4709 while (src < limit) {

	4710 char c = *src;

	4711 if (lo < c && c < hi) {

	4712 c ^= (1 << 5);

	4713 changed = true;

	4714 }

	4715 *dst = c;

	4716 ++src;

	4717 ++dst;

	4718 }

	4719 #ifdef DEBUG

	4720 CheckConvert(saved_dst, saved_src, length, changed);

	4721 #endif

	4722 return changed;

	4723 }

	4724

	4725 #ifdef DEBUG

	4726 static void CheckConvert(char* dst, char* src, int length, bool changed) {

	4727 bool expected_changed = false;

	4728 for (int i = 0; i < length; i++) {

	4729 if (dst[i] == src[i]) continue;

	4730 expected_changed = true;

	4731 if (dir == ASCII_TO_LOWER) {

	4732 ASSERT('A' <= src[i] && src[i] <= 'Z');

	4733 ASSERT(dst[i] == src[i] + ('a' - 'A'));

	4734 } else {

	4735 ASSERT(dir == ASCII_TO_UPPER);

	4736 ASSERT('a' <= src[i] && src[i] <= 'z');

	4737 ASSERT(dst[i] == src[i] - ('a' - 'A'));

	4738 }

	4739 }

	4740 ASSERT(expected_changed == changed);

	4741 }

	4742 #endif

	4743 };

	4744

	4745

4639 struct ToLowerTraits {	4746 struct ToLowerTraits {

4640 typedef unibrow::ToLowercase UnibrowConverter;	4747 typedef unibrow::ToLowercase UnibrowConverter;

4641	4748

4642 static bool ConvertAscii(char* dst, char* src, int length) {	4749 typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;

4643 bool changed = false;

4644 for (int i = 0; i < length; ++i) {

4645 char c = src[i];

4646 if ('A' <= c && c <= 'Z') {

4647 c += ('a' - 'A');

4648 changed = true;

4649 }

4650 dst[i] = c;

4651 }

4652 return changed;

4653 }

4654 };	4750 };

4655	4751

4656	4752

4657 struct ToUpperTraits {	4753 struct ToUpperTraits {

4658 typedef unibrow::ToUppercase UnibrowConverter;	4754 typedef unibrow::ToUppercase UnibrowConverter;

4659	4755

4660 static bool ConvertAscii(char* dst, char* src, int length) {	4756 typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;

4661 bool changed = false;

4662 for (int i = 0; i < length; ++i) {

4663 char c = src[i];

4664 if ('a' <= c && c <= 'z') {

4665 c -= ('a' - 'A');

4666 changed = true;

4667 }

4668 dst[i] = c;

4669 }

4670 return changed;

4671 }

4672 };	4757 };

4673	4758

4674 } // namespace	4759 } // namespace

4675	4760

4676	4761

4677 template <typename ConvertTraits>	4762 template <typename ConvertTraits>

4678 static Object* ConvertCase(	4763 static Object* ConvertCase(

4679 Arguments args,	4764 Arguments args,

4680 unibrow::Mapping<typename ConvertTraits::UnibrowConverter, 128>* mapping) {	4765 unibrow::Mapping<typename ConvertTraits::UnibrowConverter, 128>* mapping) {

4681 NoHandleAllocation ha;	4766 NoHandleAllocation ha;

4682 CONVERT_CHECKED(String, s, args[0]);	4767 CONVERT_CHECKED(String, s, args[0]);

4683 s = s->TryFlattenGetString();	4768 s = s->TryFlattenGetString();

4684	4769

4685 const int length = s->length();	4770 const int length = s->length();

4686 // Assume that the string is not empty; we need this assumption later	4771 // Assume that the string is not empty; we need this assumption later

4687 if (length == 0) return s;	4772 if (length == 0) return s;

4688	4773

4689 // Simpler handling of ascii strings.	4774 // Simpler handling of ascii strings.

4690 //	4775 //

4691 // NOTE: This assumes that the upper/lower case of an ascii	4776 // NOTE: This assumes that the upper/lower case of an ascii

4692 // character is also ascii. This is currently the case, but it	4777 // character is also ascii. This is currently the case, but it

4693 // might break in the future if we implement more context and locale	4778 // might break in the future if we implement more context and locale

4694 // dependent upper/lower conversions.	4779 // dependent upper/lower conversions.

4695 if (s->IsSeqAsciiString()) {	4780 if (s->IsSeqAsciiString()) {

4696 Object* o = Heap::AllocateRawAsciiString(length);	4781 Object* o = Heap::AllocateRawAsciiString(length);

4697 if (o->IsFailure()) return o;	4782 if (o->IsFailure()) return o;

4698 SeqAsciiString* result = SeqAsciiString::cast(o);	4783 SeqAsciiString* result = SeqAsciiString::cast(o);

4699 bool has_changed_character = ConvertTraits::ConvertAscii(	4784 bool has_changed_character = ConvertTraits::AsciiConverter::Convert(

4700 result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);	4785 result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);

4701 return has_changed_character ? result : s;	4786 return has_changed_character ? result : s;

4702 }	4787 }

4703	4788

4704 Object* answer = ConvertCaseHelper(s, length, length, mapping);	4789 Object* answer = ConvertCaseHelper(s, length, length, mapping);

4705 if (answer->IsSmi()) {	4790 if (answer->IsSmi()) {

4706 // Retry with correct length.	4791 // Retry with correct length.

4707 answer = ConvertCaseHelper(s, Smi::cast(answer)->value(), length, mapping);	4792 answer = ConvertCaseHelper(s, Smi::cast(answer)->value(), length, mapping);

4708 }	4793 }

4709 return answer; // This may be a failure.	4794 return answer; // This may be a failure.

(...skipping 5440 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
10150 } else {	10235 } else {

10151 // Handle last resort GC and make sure to allow future allocations	10236 // Handle last resort GC and make sure to allow future allocations

10152 // to grow the heap without causing GCs (if possible).	10237 // to grow the heap without causing GCs (if possible).

10153 Counters::gc_last_resort_from_js.Increment();	10238 Counters::gc_last_resort_from_js.Increment();

10154 Heap::CollectAllGarbage(false);	10239 Heap::CollectAllGarbage(false);

10155 }	10240 }

10156 }	10241 }

10157	10242

10158	10243

10159 } } // namespace v8::internal	10244 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/globals.h ('k') | test/mjsunit/string-case.js » ('j') | test/mjsunit/string-case.js » ('J')