Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(152)

Unified Diff: src/runtime.cc

Issue 4189001: Faster ascii string case conversion. (Closed)
Patch Set: Fix typo Created 10 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index eaa27dcf83cf62060daa7392d78a8a878578e7cd..73f8fb53b7480beff39b9f4327e1a55213b061b8 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -4636,39 +4636,124 @@ static Object* ConvertCaseHelper(String* s,
namespace {
-struct ToLowerTraits {
- typedef unibrow::ToLowercase UnibrowConverter;
+static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
+
+
+// Given a word and two range boundaries returns a word with high bit
+// set in every byte iff the corresponding input byte was strictly in
+// the range (m, n). All the other bits in the result are cleared.
antonm 2010/10/26 17:17:33 maybe follow more standard [;) intervals
Vitaly Repeshko 2010/10/26 18:14:48 I want to make usages of m and n similar, so eithe
+// This function is only useful when it can be inlined and the
+// boundaries are statically known.
+// Requires: the input word and the boundaries must be ascii.
antonm 2010/10/26 17:17:33 I think ascii requires additional clarification.
Vitaly Repeshko 2010/10/26 18:14:48 Agreed. Extended the comment.
+static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
+ // Every byte in an ascii string is less than or equal to 0x7F.
+ ASSERT((w & (kOneInEveryByte * 0x7F)) == w);
+ ASSERT(0 < m && m < n && n < 0x7F);
+ // Has high bit set in every w byte less than n.
+ uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;
+ // Has high bit set in every w byte greater than m.
+ uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);
+ return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));
+}
+
+
+enum AsciiCaseConversion {
+ ASCII_TO_LOWER,
+ ASCII_TO_UPPER
+};
- static bool ConvertAscii(char* dst, char* src, int length) {
+
+template <AsciiCaseConversion dir>
+struct FastAsciiConverter {
+ static bool Convert(char* dst, char* src, int length) {
+#ifdef DEBUG
+ char* saved_dst = dst;
+ char* saved_src = src;
+#endif
+ // We rely on the distance between lower and upper case letters
+ // being a power of 2.
+ ASSERT('a' - 'A' == (1 << 5));
+ // Boundaries for the range of input characters than require conversion.
+ const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;
+ const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;
bool changed = false;
- for (int i = 0; i < length; ++i) {
- char c = src[i];
- if ('A' <= c && c <= 'Z') {
- c += ('a' - 'A');
+ char* const limit = src + length;
+#ifdef V8_HOST_CAN_READ_UNALIGNED
+ // Process the prefix of the input that requires no conversion one
+ // (machine) word at a time.
+ while (src <= limit - sizeof(uintptr_t)) {
antonm 2010/10/26 17:17:33 Maybe convert src to uintptr_t*? That would allow
Vitaly Repeshko 2010/10/26 18:14:48 Ahh, I'm really afraid of the compiler emitting di
+ uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
+ if (AsciiRangeMask(w, lo, hi) != 0) {
+ changed = true;
+ break;
+ }
+ *reinterpret_cast<uintptr_t*>(dst) = w;
+ src += sizeof(uintptr_t);
+ dst += sizeof(uintptr_t);
+ }
+ // Process the remainder of the input performing conversion when
+ // required one word at a time.
+ while (src <= limit - sizeof(uintptr_t)) {
+ uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
+ uintptr_t m = AsciiRangeMask(w, lo, hi);
+ // The mask has high (7th) bit set in every byte that needs
+ // conversion and we know that the distance between cases is
+ // 1 << 5.
+ *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
+ src += sizeof(uintptr_t);
+ dst += sizeof(uintptr_t);
+ }
+#endif
+ // Process the last few bytes of the input (or the whole input if
+ // unaligned accesses are not supported).
+ while (src < limit) {
+ char c = *src;
+ if (lo < c && c < hi) {
+ c ^= (1 << 5);
changed = true;
}
- dst[i] = c;
+ *dst = c;
+ ++src;
+ ++dst;
}
+#ifdef DEBUG
+ CheckConvert(saved_dst, saved_src, length, changed);
+#endif
return changed;
}
+
+#ifdef DEBUG
+ static void CheckConvert(char* dst, char* src, int length, bool changed) {
+ bool expected_changed = false;
+ for (int i = 0; i < length; i++) {
+ if (dst[i] == src[i]) continue;
+ expected_changed = true;
+ if (dir == ASCII_TO_LOWER) {
+ ASSERT('A' <= src[i] && src[i] <= 'Z');
+ ASSERT(dst[i] == src[i] + ('a' - 'A'));
+ } else {
+ ASSERT(dir == ASCII_TO_UPPER);
+ ASSERT('a' <= src[i] && src[i] <= 'z');
+ ASSERT(dst[i] == src[i] - ('a' - 'A'));
+ }
+ }
+ ASSERT(expected_changed == changed);
+ }
+#endif
+};
+
+
+struct ToLowerTraits {
+ typedef unibrow::ToLowercase UnibrowConverter;
+
+ typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;
};
struct ToUpperTraits {
typedef unibrow::ToUppercase UnibrowConverter;
- static bool ConvertAscii(char* dst, char* src, int length) {
- bool changed = false;
- for (int i = 0; i < length; ++i) {
- char c = src[i];
- if ('a' <= c && c <= 'z') {
- c -= ('a' - 'A');
- changed = true;
- }
- dst[i] = c;
- }
- return changed;
- }
+ typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;
};
} // namespace
@@ -4696,7 +4781,7 @@ static Object* ConvertCase(
Object* o = Heap::AllocateRawAsciiString(length);
if (o->IsFailure()) return o;
SeqAsciiString* result = SeqAsciiString::cast(o);
- bool has_changed_character = ConvertTraits::ConvertAscii(
+ bool has_changed_character = ConvertTraits::AsciiConverter::Convert(
result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);
return has_changed_character ? result : s;
}
« no previous file with comments | « src/globals.h ('k') | test/mjsunit/string-case.js » ('j') | test/mjsunit/string-case.js » ('J')

Powered by Google App Engine
This is Rietveld 408576698