Index: bench/pack_int_uint16_t_Bench.cpp
diff --git a/bench/pack_int_uint16_t_Bench.cpp b/bench/pack_int_uint16_t_Bench.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e1527d7ca25c2cc6a761ddf7801be0293bfe134
--- /dev/null
+++ b/bench/pack_int_uint16_t_Bench.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkTypes.h"
+
+/**
+ * There are a variety of ways to pack from int down to uint16_t with SSE,
+ * depending on the specific instructions available.
+ *
+ * SSE2 offers an int -> int16_t pack instruction with signed saturation. We can use it in two ways:
+ *   - subtract off 32768, pack int -> int16_t, then add 32768 back (sse2_a)
+ *   - artificially sign-extend the (positive) value in our int first, then pack int -> int16_t (sse2_b)
+ * SSSE3 adds a byte shuffle, so we just put the bytes where we want them. (ssse3)
+ * SSE4.1 added an int -> uint16_t pack instruction with unsigned saturation. (sse41)
+ *
+ * Findings so far:
+ *   - sse41 < ssse3 <<< sse2_b < sse2_a
+ *   - the ssse3 version is only slightly slower than the sse41 version, maybe not at all
+ *   - the sse2_a version is only slightly slower than sse2_b
+ *   - the ssse3 and sse41 versions are about 3x faster than either sse2 version
+ *   - the sse41 version seems to cause some code generation trouble
+ */
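+
+// For reference, a scalar sketch of what each kernel below computes per 32-bit lane,
+// assuming the value is already in [0, 65535]:
+//
+//     uint16_t pack(int32_t v) { return (uint16_t)v; }
+//
+// Outside that range the kernels disagree (e.g. sse41 saturates, ssse3 truncates).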
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+#include <immintrin.h>
+
+template <__m128i (kernel)(__m128i)>
+class pack_int_uint16_t_Bench : public Benchmark {
+public:
+    pack_int_uint16_t_Bench(const char* impl) {
+        fName.append("pack_int_uint16_t_");
+        fName.append(impl);
+    }
+
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+    const char* onGetName() override { return fName.c_str(); }
+
+    void onDraw(int loops, SkCanvas*) override {
+        __m128i x = _mm_set1_epi32(0x42424242);
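+        // Each iteration feeds the previous result back in as the next input; the data
+        // dependency keeps the compiler from hoisting or collapsing the kernel calls.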
+        while (loops --> 0) {
+            x = kernel(x);
+        }
+
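+        // XOR the result into a volatile sink so the compiler can't optimize the loop away.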
+        volatile int blackhole = 0;
+        blackhole ^= _mm_cvtsi128_si32(x);
+    }
+
+    SkString fName;
+};
+
+namespace {
+    __m128i sse2_a(__m128i x) {
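+        // The bias trick: subtracting 0x8000 maps [0, 0xFFFF] into [-0x8000, 0x7FFF],
+        // which the signed pack preserves exactly; adding 0x8000 back (mod 2^16)
+        // restores the unsigned value, e.g. 0xFFFF -> 0x7FFF -> pack -> 0xFFFF.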
+        x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000));
+        return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16((short)0x8000));
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )
+
+namespace {
+    __m128i sse2_b(__m128i x) {
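+        // (x << 16) >> 16 with an arithmetic shift sign-extends the low 16 bits, putting
+        // each lane in [-0x8000, 0x7FFF] so the signed pack can't saturate.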
+        x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
+        return _mm_packs_epi32(x,x);
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+namespace {
+    __m128i ssse3(__m128i x) {
+        // TODO: Can we force the bench to load the mask inside the loop? Would be more realistic.
+        const int _ = ~0;
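+        // Indices 0,1 / 4,5 / 8,9 / 12,13 pick out the low uint16_t of each lane; ~0 has
+        // its high bit set, which tells _mm_shuffle_epi8 to write a zero byte instead.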
+        return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); )
+#endif
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+namespace {
+    __m128i sse41(__m128i x) {
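+        // _mm_packus_epi32 packs int32 -> uint16_t with unsigned saturation in one step.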
+        return _mm_packus_epi32(x,x);
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )
+#endif
+
+#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |