/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "Benchmark.h"
#include "SkTypes.h"

/**
 * There's a good variety of ways to pack from int down to uint16_t with SSE,
 * depending on the specific instructions available.
 *
 * SSE2 offers an int -> int16_t pack instruction.  We can use it in two ways:
 *   - subtract off 32768, int -> int16_t, add 32768 back                                   (sse2_a)
 *   - first artificially sign extend the (positive) value in our int, then int -> int16_t  (sse2_b)
 * SSSE3 adds a byte shuffle, so we just put the bytes where we want them.                   (ssse3)
 * SSE41 adds an int -> uint16_t pack instruction.                                           (sse41)
 *
 * Findings so far:
 *   - sse41 < ssse3 <<< sse2_b < sse2_a
 *   - the ssse3 version is only slightly slower than the sse41 version, maybe not at all
 *   - the sse2_a version is only slightly slower than the sse2_b version
 *   - the ssse3 and sse41 versions are about 3x faster than either sse2 version
 *   - the sse41 version seems to cause some code generation trouble.
 */
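
// Not part of the original benchmark: a scalar sketch of the narrowing these kernels
// perform.  sse2_a and sse41 clamp each int to [0, 65535]; ssse3 and sse2_b simply
// truncate, which agrees with clamping whenever the value already fits in a uint16_t.
static inline uint16_t pack_int_uint16_t_scalar(int v) {
    if (v <      0) { return 0; }
    if (v > 0xFFFF) { return 0xFFFF; }
    return (uint16_t)v;
}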

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

#include <immintrin.h>

template <__m128i (kernel)(__m128i)>
class pack_int_uint16_t_Bench : public Benchmark {
public:
    pack_int_uint16_t_Bench(const char* impl) {
        fName.append("pack_int_uint16_t_");
        fName.append(impl);
    }

    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
    const char* onGetName() override { return fName.c_str(); }

    void onDraw(int loops, SkCanvas*) override {
        __m128i x = _mm_set1_epi32(0x42424242);
        // Feed each result back into the next call so iterations can't be overlapped away.
        while (loops --> 0) {
            x = kernel(x);
        }

        // Launder the result through a volatile so the compiler can't elide the loop entirely.
        volatile int blackhole = 0;
        blackhole ^= _mm_cvtsi128_si32(x);
    }

    SkString fName;
};

namespace {
    // Bias by 32768 so the signed int -> int16_t pack saturates where we want, then unbias.
    __m128i sse2_a(__m128i x) {
        x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000));
        return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16((short)0x8000));
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )
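
// Illustrative only, not part of the benchmark: how a kernel like sse2_a might be used
// to narrow a buffer of ints down to uint16_ts, four lanes at a time.  Assumes n is a
// multiple of 4; the names pack_row, dst, and src are hypothetical.
static inline void pack_row(uint16_t* dst, const int* src, int n) {
    for (int i = 0; i < n; i += 4) {
        __m128i packed = sse2_a(_mm_loadu_si128((const __m128i*)(src + i)));
        _mm_storel_epi64((__m128i*)(dst + i), packed);   // the low 64 bits hold the 4 uint16_ts
    }
}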

namespace {
    // Sign-extend the low 16 bits of each lane so the signed pack passes them through unclipped.
    __m128i sse2_b(__m128i x) {
        x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
        return _mm_packs_epi32(x,x);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
namespace {
    __m128i ssse3(__m128i x) {
        // TODO: Can we force the bench to load the mask inside the loop?  Would be more realistic.
        // Shuffle indices with the high bit set (~0) zero that output byte, so only the
        // low two bytes of each lane survive, packed into the bottom 8 bytes.
        const int _ = ~0;
        return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); )
#endif

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
namespace {
    // packus is exactly the operation we want: int -> uint16_t with unsigned saturation.
    __m128i sse41(__m128i x) {
        return _mm_packus_epi32(x,x);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )
#endif

#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2