Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Side by Side Diff: bench/pack_int_uint16_t_Bench.cpp

Issue 2150343002: Add a bench to measure the best way to pack from int to uint16_t with SSE. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: so tired of this MSVC... Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/opts/SkNx_sse.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "Benchmark.h"
9 #include "SkTypes.h"
10
11 /**
12 * There's a good variety of ways to pack from int down to uint16_t with SSE,
13 * depending on the specific instructions available.
14 *
15 * SSE2 offers an int -> int16_t pack instruction. We can use this in two ways:
16 * - subtract off 32768, int -> int16_t, add 32768 back (sse2_a)
17 * - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b)
18 * SSSE3 adds a byte shuffle, so we just put the bytes where we want them. (ssse3)
19 * SSE41 added an int -> uint16_t pack instruction. (sse41)
20 *
21 * Findings so far:
22 * - sse41 < ssse3 <<< sse2_b < sse2_a;
23 * - the ssse3 version is only slightly slower than the sse41 version, maybe not at all
24 * - the sse2_a is only slightly slower than the sse2_b version
25 * - the ssse3 and sse41 versions are about 3x faster than either sse2 version
26 * - the sse41 version seems to cause some code generation trouble.
27 */
28
29 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
30
31 #include <immintrin.h>
32
33 template <__m128i (kernel)(__m128i)>
34 class pack_int_uint16_t_Bench : public Benchmark {
35 public:
36 pack_int_uint16_t_Bench(const char* impl) {
37 fName.append("pack_int_uint16_t_");
38 fName.append(impl);
39 }
40
41 bool isSuitableFor(Backend backend) override { return backend == kNonRenderi ng_Backend; }
42 const char* onGetName() override { return fName.c_str(); }
43
44 void onDraw(int loops, SkCanvas*) override {
45 __m128i x = _mm_set1_epi32(0x42424242);
46 while (loops --> 0) {
47 x = kernel(x);
48 }
49
50 volatile int blackhole = 0;
51 blackhole ^= _mm_cvtsi128_si32(x);
52 }
53
54 SkString fName;
55 };
56
namespace {
    /**
     * SSE2 pack, variant A: subtract 0x8000 from every 32-bit lane, pack to 16-bit
     * lanes with _mm_packs_epi32 (x supplies both operands, so the four packed
     * values appear twice in the result), then add 0x8000 back to every 16-bit lane.
     *
     * @param x  Four 32-bit integer lanes.
     * @return   Eight 16-bit lanes; lanes 4-7 repeat lanes 0-3.
     */
    __m128i sse2_a(__m128i x) {
        x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000));
        return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16(static_cast<short>(0x8000)));
    }
}
63 DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )
64
namespace {
    /**
     * SSE2 pack, variant B: shift every 32-bit lane left by 16 and then
     * arithmetic-shift right by 16, which sign-extends the low 16 bits of each
     * lane, then pack to 16-bit lanes with _mm_packs_epi32 (x supplies both
     * operands, so the four packed values appear twice in the result).
     *
     * @param x  Four 32-bit integer lanes; only the low 16 bits of each survive.
     * @return   Eight 16-bit lanes; lanes 4-7 repeat lanes 0-3.
     */
    __m128i sse2_b(__m128i x) {
        x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
        return _mm_packs_epi32(x,x);
    }
}
71 DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )
72
73 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
74 namespace {
75 __m128i ssse3(__m128i x) {
76 // TODO: Can we force the bench to load the mask inside the loop? Would be more realistic.
77 const int _ = ~0;
78 return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_ ,_,_,_));
79 }
80 }
81 DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); )
82 #endif
83
84 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
85 namespace {
86 __m128i sse41(__m128i x) {
87 return _mm_packus_epi32(x,x);
88 }
89 }
90 DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )
91 #endif
92
93 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
OLDNEW
« no previous file with comments | « no previous file | src/opts/SkNx_sse.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698