src/opts/SkPx_sse.h - Issue 1317233005: SkPx: new approach to fixed-point SIMD

Side by Side Diff: src/opts/SkPx_sse.h

Issue 1317233005: SkPx: new approach to fixed-point SIMD (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: shl,shr Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2015 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkPx_sse_DEFINED

	9 #define SkPx_sse_DEFINED

	10

	11 // SkPx_sse's sweet spot is to work with 4 pixels at a time,

	12 // stored interlaced, just as they sit in memory: rgba rgba rgba rgba.

	13

	14 // SkPx_sse's best way to work with alphas is similar,

	15 // replicating the 4 alphas 4 times each across the pixel: aaaa aaaa aaaa aaaa.

	16

	17 // When working with fewer than 4 pixels, we load the pixels in the low lanes,

	18 // usually filling the top lanes with zeros (but who cares, might be junk).

	19

	20 struct SkPx_sse {

	21 static const int N = 4;

	22

	23 __m128i fVec;

	24 SkPx_sse(__m128i vec) : fVec(vec) {}

	25

	26 static SkPx_sse Dup(uint32_t px) { return _mm_set1_epi32(px); }

	27 static SkPx_sse Load(const uint32_t* px) { return _mm_loadu_si128((const __m 128i*)px); }

	28 static SkPx_sse Load(const uint32_t* px, int n) {

	29 SkASSERT(n > 0 && n < 4);

	30 switch (n) {

	31 case 1: return _mm_cvtsi32_si128(px[0]);

	32 case 2: return _mm_loadl_epi64((const __m128i*)px);

	33 case 3: return _mm_or_si128(_mm_loadl_epi64((const __m128i*)px),

	34 _mm_slli_si128(_mm_cvtsi32_si128(px[2]), 8));

	35 }

	36 return _mm_setzero_si128(); // Not actually reachable.

	37 }

	38

	39 void store(uint32_t* px) const { _mm_storeu_si128((__m128i*)px, fVec); }

	40 void store(uint32_t* px, int n) const {

	41 SkASSERT(n > 0 && n < 4);

	42 __m128i v = fVec;

	43 if (n & 1) {

	44 *px++ = _mm_cvtsi128_si32(v);

	45 v = _mm_srli_si128(v, 4);

	46 }

	47 if (n & 2) {

	48 _mm_storel_epi64((__m128i*)px, v);

	49 }

	50 }

	51

	52 struct Alpha {

	53 __m128i fVec;

	54 Alpha(__m128i vec) : fVec(vec) {}

	55

	56 static Alpha Dup(uint8_t a) { return _mm_set1_epi8(a); }

	57 static Alpha Load(const uint8_t* a) {

	58 __m128i as = _mm_cvtsi32_si128((const uint32_t)a); // ____ ____ ____ 3210

	59 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	60 return _mm_shuffle_epi8(as, _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0));

	61 #else

	62 as = _mm_unpacklo_epi8 (as, _mm_setzero_si128()); // ____ ____ _3_2 _1_0

	63 as = _mm_unpacklo_epi16(as, _mm_setzero_si128()); // ___3 ___2 ___1 ___0

	64 as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00

	65 return _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000

	66 #endif

	67 }

	68 static Alpha Load(const uint8_t* a, int n) {

	69 SkASSERT(n > 0 && n < 4);

	70 uint8_t a4[] = { 0,0,0,0 };

	71 switch (n) {

	72 case 3: a4[2] = a[2]; // fall through

	73 case 2: a4[1] = a[1]; // fall through

	74 case 1: a4[0] = a[0];

	75 }

	76 return Load(a4);

	77 }

	78

	79 Alpha inv() const { return _mm_sub_epi8(_mm_set1_epi8(~0), fVec); }

	80 };

	81

	82 struct Wide {

	83 __m128i fLo, fHi;

	84 Wide(__m128i lo, __m128i hi) : fLo(lo), fHi(hi) {}

	85

	86 Wide operator+(const Wide& o) const {

	87 return Wide(_mm_add_epi16(fLo, o.fLo), _mm_add_epi16(fHi, o.fHi));

	88 }

	89 Wide operator-(const Wide& o) const {

	90 return Wide(_mm_sub_epi16(fLo, o.fLo), _mm_sub_epi16(fHi, o.fHi));

	91 }

	92 template <int bits> Wide shl() const {

	93 return Wide(_mm_slli_epi16(fLo, bits), _mm_slli_epi16(fHi, bits));

	94 }

	95 template <int bits> Wide shr() const {

	96 return Wide(_mm_srli_epi16(fLo, bits), _mm_srli_epi16(fHi, bits));

	97 }

	98

	99 SkPx_sse addNarrowHi(const SkPx_sse& o) const {

	100 Wide sum = (*this + o.widenLo()).shr<8>();

	101 return _mm_packus_epi16(sum.fLo, sum.fHi);

	102 }

	103 };

	104

	105 Alpha alpha() const {

	106 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	107 return _mm_shuffle_epi8(fVec, _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7 ,7,7, 3,3,3,3));

	108 #else

	109 __m128i as = _mm_srli_epi32(fVec, 24); // ___3 ___2 ___1 ___0

	110 as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00

	111 return _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000

	112 #endif

	113 }

	114

	115 Wide widenLo() const {

	116 return Wide(_mm_unpacklo_epi8(fVec, _mm_setzero_si128()),

	117 _mm_unpackhi_epi8(fVec, _mm_setzero_si128()));

	118 }

	119 Wide widenHi() const {

	120 return Wide(_mm_unpacklo_epi8(_mm_setzero_si128(), fVec),

	121 _mm_unpackhi_epi8(_mm_setzero_si128(), fVec));

	122 }

	123 Wide widenLoHi() const {

	124 return Wide(_mm_unpacklo_epi8(fVec, fVec),

	125 _mm_unpackhi_epi8(fVec, fVec));

	126 }

	127

	128 SkPx_sse operator+(const SkPx_sse& o) const { return _mm_add_epi8(fVec, o .fVec); }

	129 SkPx_sse operator-(const SkPx_sse& o) const { return _mm_sub_epi8(fVec, o .fVec); }

	130 SkPx_sse saturatedAdd(const SkPx_sse& o) const { return _mm_adds_epi8(fVec, o.fVec); }

	131

	132 Wide operator*(const Alpha& a) const {

	133 __m128i pLo = _mm_unpacklo_epi8( fVec, _mm_setzero_si128()),

	134 aLo = _mm_unpacklo_epi8(a.fVec, _mm_setzero_si128()),

	135 pHi = _mm_unpackhi_epi8( fVec, _mm_setzero_si128()),

	136 aHi = _mm_unpackhi_epi8(a.fVec, _mm_setzero_si128());

	137 return Wide(_mm_mullo_epi16(pLo, aLo), _mm_mullo_epi16(pHi, aHi));

	138 }

	139 SkPx_sse approxMulDiv255(const Alpha& a) const {

	140 return (this a).addNarrowHi(*this);

	141 }

	142

	143 SkPx_sse addAlpha(const Alpha& a) const {

	144 return _mm_add_epi8(fVec, _mm_and_si128(a.fVec, _mm_set1_epi32(0xFF00000 0)));

	145 }

	146 };

	147

	148 typedef SkPx_sse SkPx;

	149

	150 #endif//SkPx_sse_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkPx_none.h ('k') | no next file » | no next file with comments »