src/opts/SkNx_avx.h - Issue 1432903002: float xfermodes (burn, dodge, softlight) in Sk8f, possibly using AVX.

Side by Side Diff: src/opts/SkNx_avx.h

Issue 1432903002: float xfermodes (burn, dodge, softlight) in Sk8f, possibly using AVX. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: 1.0f/255 Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2015 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkNx_avx_DEFINED

	9 #define SkNx_avx_DEFINED

	10

	11 // This file may assume <= AVX, but must check SK_CPU_SSE_LEVEL for anything more recent.

	12

	13 // All the SSE specializations are still good ideas. We'll just add Sk8f.

	14 #include "SkNx_sse.h"

	15

	16 namespace { // See SkNx.h

	17

	18 template <>

	19 class SkNf<8> {

	20 public:

	21 SkNf(const __m256& vec) : fVec(vec) {}

	22

	23 SkNf() {}

	24 SkNf(float val) : fVec(_mm256_set1_ps(val)) {}

	25 static SkNf Load(const float vals[8]) { return _mm256_loadu_ps(vals); }

	26

	27 static SkNf FromBytes(const uint8_t bytes[8]) {

	28 __m128i fix8 = _mm_loadl_epi64((const __m128i*)bytes),

	29 fix16 = _mm_unpacklo_epi8 (fix8 , _mm_setzero_si128()),

	30 lo32 = _mm_unpacklo_epi16(fix16, _mm_setzero_si128()),

	31 hi32 = _mm_unpackhi_epi16(fix16, _mm_setzero_si128());

	32 __m256i fix32 = _mm256_insertf128_si256(_mm256_castsi128_si256(lo32), hi32, 1);
	msarett 2015/11/09 23:25:06
	Seems annoying to do all the unpacking in 128-bit and then combine to 256-bits. But I guess we don't get 256-bit integer unpack/shuffle instructions until AVX2.

	mtklein 2015/11/10 00:22:05
	On 2015/11/09 at 23:25:06, msarett wrote:
	> Seems annoying to do all the unpacking in 128-bit and then combine to 256-bits. But I guess we don't get 256-bit integer unpack/shuffle instructions until AVX2.
	Right. Even then, the AVX and AVX2 packing, unpacking, and shuffling instructions can't cross from the low 128 bits to the high 128 bits or the other way around. They basically only work on 2 128-bit arguments in parallel. They're depressingly disappointing. E.g., take a look at the pseudocode for vpshufb... at first you think, that looks weird, why is it written so complicated... and then you're like, oh, it just sucks that's why: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3868,474...
	The only unpacking instruction that's really useful for us and that works in the way you'd hope is vpmovzxbd to zero-extend 8 bytes to 8 ints, but that's AVX2. That'd get us right from fix8 to fix32 in this code (and I think can even take a memory argument to read from, compiling the entire function up to __m256i fix32 = ...; into a single instruction).

	msarett 2015/11/10 14:54:16
	On 2015/11/10 00:22:05, mtklein wrote:
	> Right. Even then, the AVX and AVX2 packing, unpacking, and shuffling instructions can't cross from the low 128 bits to the high 128 bits or the other way around. They basically only work on 2 128-bit arguments in parallel. They're depressingly disappointing. E.g., take a look at the pseudocode for vpshufb... at first you think, that looks weird, why is it written so complicated... and then you're like, oh, it just sucks that's why: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3868,474...
	Good point, forgot about that.
	> The only unpacking instruction that's really useful for us and that works in the way you'd hope is vpmovzxbd to zero-extend 8 bytes to 8 ints, but that's AVX2. That'd get us right from fix8 to fix32 in this code (and I think can even take a memory argument to read from, compiling the entire function up to __m256i fix32 = ...; into a single instruction).
	Cool!
	33 return _mm256_cvtepi32_ps(fix32);

	34 }

	35

	36 SkNf(float a, float b, float c, float d,

	37 float e, float f, float g, float h) : fVec(_mm256_setr_ps(a,b,c,d,e,f,g,h)) {}

	38

	39 void store(float vals[8]) const { _mm256_storeu_ps(vals, fVec); }

	40 void toBytes(uint8_t bytes[8]) const {

	41 __m256i fix32 = _mm256_cvttps_epi32(fVec);

	42 __m128i lo32 = _mm256_extractf128_si256(fix32, 0),

	43 hi32 = _mm256_extractf128_si256(fix32, 1),

	44 fix16 = _mm_packus_epi32(lo32, hi32),

	45 fix8 = _mm_packus_epi16(fix16, fix16);

	46 _mm_storel_epi64((__m128i*)bytes, fix8);

	47 }

	48

	49 SkNf operator + (const SkNf& o) const { return _mm256_add_ps(fVec, o.fVec); }

	50 SkNf operator - (const SkNf& o) const { return _mm256_sub_ps(fVec, o.fVec); }

	51 SkNf operator * (const SkNf& o) const { return _mm256_mul_ps(fVec, o.fVec); }

	52 SkNf operator / (const SkNf& o) const { return _mm256_div_ps(fVec, o.fVec); }

	53

	54 SkNf operator == (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_EQ_OQ); }

	55 SkNf operator != (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_NEQ_OQ); }

	56 SkNf operator < (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LT_OQ); }

	57 SkNf operator > (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GT_OQ); }

	58 SkNf operator <= (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LE_OQ); }

	59 SkNf operator >= (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GE_OQ); }

	60

	61 static SkNf Min(const SkNf& l, const SkNf& r) { return _mm256_min_ps(l.fVec, r.fVec); }

	62 static SkNf Max(const SkNf& l, const SkNf& r) { return _mm256_max_ps(l.fVec, r.fVec); }

	63

	64 SkNf sqrt() const { return _mm256_sqrt_ps (fVec); }

	65 SkNf rsqrt0() const { return _mm256_rsqrt_ps(fVec); }

	66 SkNf rsqrt1() const { return this->rsqrt0(); }

	67 SkNf rsqrt2() const { return this->rsqrt1(); }

	68

	69 SkNf invert() const { return SkNf(1) / *this; }

	70 SkNf approxInvert() const { return _mm256_rcp_ps(fVec); }

	71

	72 template <int k> float kth() const {

	73 SkASSERT(0 <= k && k < 8);

	74 union { __m256 v; float fs[8]; } pun = {fVec};

	75 return pun.fs[k&7];

	76 }

	77

	78 bool allTrue() const { return 0xff == _mm256_movemask_ps(fVec); }

	79 bool anyTrue() const { return 0x00 != _mm256_movemask_ps(fVec); }

	80

	81 SkNf thenElse(const SkNf& t, const SkNf& e) const {

	82 return _mm256_blendv_ps(e.fVec, t.fVec, fVec);

	83 }

	84

	85 __m256 fVec;

	86 };

	87

	88 } // namespace

	89

	90 #endif//SkNx_avx_DEFINED

OLD	NEW

« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkNx_neon.h » ('j') | src/opts/SkXfermode_opts.h » ('J')