src/runtime.cc - Issue 4189001: Faster ascii string case conversion.

Unified Diff: src/runtime.cc

Issue 4189001: Faster ascii string case conversion. (Closed)

Patch Set: Fix typo Created 10 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/runtime.cc

diff --git a/src/runtime.cc b/src/runtime.cc

index eaa27dcf83cf62060daa7392d78a8a878578e7cd..73f8fb53b7480beff39b9f4327e1a55213b061b8 100644

--- a/src/runtime.cc

+++ b/src/runtime.cc

@@ -4636,39 +4636,124 @@ static Object* ConvertCaseHelper(String* s,

namespace {

-struct ToLowerTraits {

- typedef unibrow::ToLowercase UnibrowConverter;

+static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;

+// Given a word and two range boundaries returns a word with high bit

+// set in every byte iff the corresponding input byte was strictly in

+// the range (m, n). All the other bits in the result are cleared.

antonm 2010/10/26 17:17:33 maybe follow more standard [;) intervals

Vitaly Repeshko 2010/10/26 18:14:48 I want to make usages of m and n similar, so eithe

+// This function is only useful when it can be inlined and the

+// boundaries are statically known.

+// Requires: the input word and the boundaries must be ascii.

antonm 2010/10/26 17:17:33 I think ascii requires additional clarification.

Vitaly Repeshko 2010/10/26 18:14:48 Agreed. Extended the comment.

+static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {

+ // Every byte in an ascii string is less than or equal to 0x7F.

+ ASSERT((w & (kOneInEveryByte * 0x7F)) == w);

+ ASSERT(0 < m && m < n && n < 0x7F);

+ // Has high bit set in every w byte less than n.

+ uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;

+ // Has high bit set in every w byte greater than m.

+ uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);

+ return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));

+enum AsciiCaseConversion {

+ ASCII_TO_LOWER,

+ ASCII_TO_UPPER

+};

- static bool ConvertAscii(char* dst, char* src, int length) {

+template <AsciiCaseConversion dir>

+struct FastAsciiConverter {

+ static bool Convert(char* dst, char* src, int length) {

+#ifdef DEBUG

+ char* saved_dst = dst;

+ char* saved_src = src;

+#endif

+ // We rely on the distance between lower and upper case letters

+ // being a power of 2.

+ ASSERT('a' - 'A' == (1 << 5));

+ // Boundaries for the range of input characters than require conversion.

+ const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;

+ const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;

bool changed = false;

- for (int i = 0; i < length; ++i) {

- char c = src[i];

- if ('A' <= c && c <= 'Z') {

- c += ('a' - 'A');

+ char* const limit = src + length;

+#ifdef V8_HOST_CAN_READ_UNALIGNED

+ // Process the prefix of the input that requires no conversion one

+ // (machine) word at a time.

+ while (src <= limit - sizeof(uintptr_t)) {

antonm 2010/10/26 17:17:33 Maybe convert src to uintptr_t*? That would allow

Vitaly Repeshko 2010/10/26 18:14:48 Ahh, I'm really afraid of the compiler emitting di

+ uintptr_t w = *reinterpret_cast<uintptr_t*>(src);

+ if (AsciiRangeMask(w, lo, hi) != 0) {

+ changed = true;

+ break;

+ }

+ *reinterpret_cast<uintptr_t*>(dst) = w;

+ src += sizeof(uintptr_t);

+ dst += sizeof(uintptr_t);

+ }

+ // Process the remainder of the input performing conversion when

+ // required one word at a time.

+ while (src <= limit - sizeof(uintptr_t)) {

+ uintptr_t w = *reinterpret_cast<uintptr_t*>(src);

+ uintptr_t m = AsciiRangeMask(w, lo, hi);

+ // The mask has high (7th) bit set in every byte that needs

+ // conversion and we know that the distance between cases is

+ // 1 << 5.

+ *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);

+ src += sizeof(uintptr_t);

+ dst += sizeof(uintptr_t);

+ }

+#endif

+ // Process the last few bytes of the input (or the whole input if

+ // unaligned accesses are not supported).

+ while (src < limit) {

+ char c = *src;

+ if (lo < c && c < hi) {

+ c ^= (1 << 5);

changed = true;

}

- dst[i] = c;

+ *dst = c;

+ ++src;

+ ++dst;

}

+#ifdef DEBUG

+ CheckConvert(saved_dst, saved_src, length, changed);

+#endif

return changed;

}

+#ifdef DEBUG

+ static void CheckConvert(char* dst, char* src, int length, bool changed) {

+ bool expected_changed = false;

+ for (int i = 0; i < length; i++) {

+ if (dst[i] == src[i]) continue;

+ expected_changed = true;

+ if (dir == ASCII_TO_LOWER) {

+ ASSERT('A' <= src[i] && src[i] <= 'Z');

+ ASSERT(dst[i] == src[i] + ('a' - 'A'));

+ } else {

+ ASSERT(dir == ASCII_TO_UPPER);

+ ASSERT('a' <= src[i] && src[i] <= 'z');

+ ASSERT(dst[i] == src[i] - ('a' - 'A'));

+ }

+ ASSERT(expected_changed == changed);

+ }

+#endif

+};

+struct ToLowerTraits {

+ typedef unibrow::ToLowercase UnibrowConverter;

+ typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;

};

struct ToUpperTraits {

typedef unibrow::ToUppercase UnibrowConverter;

- static bool ConvertAscii(char* dst, char* src, int length) {

- bool changed = false;

- for (int i = 0; i < length; ++i) {

- char c = src[i];

- if ('a' <= c && c <= 'z') {

- c -= ('a' - 'A');

- changed = true;

- }

- dst[i] = c;

- }

- return changed;

- }

+ typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;

};

} // namespace

@@ -4696,7 +4781,7 @@ static Object* ConvertCase(

Object* o = Heap::AllocateRawAsciiString(length);

if (o->IsFailure()) return o;

SeqAsciiString* result = SeqAsciiString::cast(o);

- bool has_changed_character = ConvertTraits::ConvertAscii(

+ bool has_changed_character = ConvertTraits::AsciiConverter::Convert(

result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);

return has_changed_character ? result : s;

}

« no previous file with comments | « src/globals.h ('k') | test/mjsunit/string-case.js » ('j') | test/mjsunit/string-case.js » ('J')