/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkPx_sse_DEFINED
#define SkPx_sse_DEFINED

// sse::SkPx's sweet spot is to work with 4 pixels at a time,
// stored interlaced, just as they sit in memory: rgba rgba rgba rgba.

// sse::SkPx's best way to work with alphas is similar,
// replicating the 4 alphas 4 times each across the pixel: aaaa aaaa aaaa aaaa.

// When working with fewer than 4 pixels, we load the pixels in the low lanes,
// usually filling the top lanes with zeros (but who cares, might be junk).

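// A typical use (an illustrative sketch, not part of the original header; src, cov,
// dst, and n are hypothetical locals) scales a row of premultiplied pixels by 8-bit
// coverage, four at a time, with the partial load/store overloads handling the tail:
//
//     while (n >= 4) {
//         SkPx::Load(src).approxMulDiv255(SkPx::Alpha::Load(cov)).store(dst);
//         src += 4; cov += 4; dst += 4; n -= 4;
//     }
//     if (n > 0) {
//         SkPx::Load(src, n).approxMulDiv255(SkPx::Alpha::Load(cov, n)).store(dst, n);
//     }
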
namespace sse {

struct SkPx {
    static const int N = 4;

    __m128i fVec;
    SkPx(__m128i vec) : fVec(vec) {}

    static SkPx Dup(uint32_t px) { return _mm_set1_epi32(px); }
    static SkPx Load(const uint32_t* px) { return _mm_loadu_si128((const __m128i*)px); }
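    // Load n < 4 pixels: one pixel into the low 32-bit lane, two pixels into the low
    // 64 bits, or for n == 3 the low two pixels OR'd with the third shifted up into lane 2.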
    static SkPx Load(const uint32_t* px, int n) {
        SkASSERT(n > 0 && n < 4);
        switch (n) {
            case 1: return _mm_cvtsi32_si128(px[0]);
            case 2: return _mm_loadl_epi64((const __m128i*)px);
            case 3: return _mm_or_si128(_mm_loadl_epi64((const __m128i*)px),
                                        _mm_slli_si128(_mm_cvtsi32_si128(px[2]), 8));
        }
        return _mm_setzero_si128();  // Not actually reachable.
    }

    void store(uint32_t* px) const { _mm_storeu_si128((__m128i*)px, fVec); }
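    // Store n < 4 pixels: the n&1 branch writes one pixel then shifts it away, and the
    // n&2 branch writes the remaining two, so together they cover n = 1, 2, and 3.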
    void store(uint32_t* px, int n) const {
        SkASSERT(n > 0 && n < 4);
        __m128i v = fVec;
        if (n & 1) {
            *px++ = _mm_cvtsi128_si32(v);
            v = _mm_srli_si128(v, 4);
        }
        if (n & 2) {
            _mm_storel_epi64((__m128i*)px, v);
        }
    }

    struct Alpha {
        __m128i fVec;
        Alpha(__m128i vec) : fVec(vec) {}

        static Alpha Dup(uint8_t a) { return _mm_set1_epi8(a); }
        static Alpha Load(const uint8_t* a) {
            __m128i as = _mm_cvtsi32_si128(*(const uint32_t*)a);  // ____ ____ ____ 3210
        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
            return _mm_shuffle_epi8(as, _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0));
        #else
            as = _mm_unpacklo_epi8 (as, as);                      // ____ ____ 3322 1100
            as = _mm_unpacklo_epi16(as, as);                      // 3333 2222 1111 0000
            return as;
        #endif
        }
        static Alpha Load(const uint8_t* a, int n) {
            SkASSERT(n > 0 && n < 4);
            uint8_t a4[] = { 0,0,0,0 };
            switch (n) {
                case 3: a4[2] = a[2];  // fall through
                case 2: a4[1] = a[1];  // fall through
                case 1: a4[0] = a[0];
            }
            return Load(a4);
        }

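        // _mm_set1_epi8(~0) is all 0xFF bytes, so inv() computes 255 - a in each lane.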
        Alpha inv() const { return _mm_sub_epi8(_mm_set1_epi8(~0), fVec); }
    };

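    // Wide holds the same 4 pixels widened to 16 bits per channel, the low two pixels
    // in fLo and the high two in fHi, so sums and products don't overflow 8 bits.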
    struct Wide {
        __m128i fLo, fHi;
        Wide(__m128i lo, __m128i hi) : fLo(lo), fHi(hi) {}

        Wide operator+(const Wide& o) const {
            return Wide(_mm_add_epi16(fLo, o.fLo), _mm_add_epi16(fHi, o.fHi));
        }
        Wide operator-(const Wide& o) const {
            return Wide(_mm_sub_epi16(fLo, o.fLo), _mm_sub_epi16(fHi, o.fHi));
        }
        template <int bits> Wide shl() const {
            return Wide(_mm_slli_epi16(fLo, bits), _mm_slli_epi16(fHi, bits));
        }
        template <int bits> Wide shr() const {
            return Wide(_mm_srli_epi16(fLo, bits), _mm_srli_epi16(fHi, bits));
        }

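        // Adds o (widened to 16 bits), shifts right by 8, and packs back down to 8-bit
        // channels with unsigned saturation; this is the back half of approxMulDiv255().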
        SkPx addNarrowHi(const SkPx& o) const {
            Wide sum = (*this + o.widenLo()).shr<8>();
            return _mm_packus_epi16(sum.fLo, sum.fHi);
        }
    };

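    // Pull each pixel's alpha byte out and replicate it across that pixel's four lanes.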
    Alpha alpha() const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
        return _mm_shuffle_epi8(fVec, _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3));
    #else
        // We exploit that A >= rgb for any premul pixel.
        __m128i as = fVec;                              // 3xxx 2xxx 1xxx 0xxx
        as = _mm_max_epu8(as, _mm_srli_epi32(as,  8));  // 33xx 22xx 11xx 00xx
        as = _mm_max_epu8(as, _mm_srli_epi32(as, 16));  // 3333 2222 1111 0000
        return as;
    #endif
    }

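    // widenLo() zero-extends each channel to 16 bits (x), widenHi() places it in the
    // high byte (x << 8), and widenLoHi() fills both bytes (x * 257).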
    Wide widenLo() const {
        return Wide(_mm_unpacklo_epi8(fVec, _mm_setzero_si128()),
                    _mm_unpackhi_epi8(fVec, _mm_setzero_si128()));
    }
    Wide widenHi() const {
        return Wide(_mm_unpacklo_epi8(_mm_setzero_si128(), fVec),
                    _mm_unpackhi_epi8(_mm_setzero_si128(), fVec));
    }
    Wide widenLoHi() const {
        return Wide(_mm_unpacklo_epi8(fVec, fVec),
                    _mm_unpackhi_epi8(fVec, fVec));
    }

    SkPx operator+(const SkPx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkPx operator-(const SkPx& o) const { return _mm_sub_epi8(fVec, o.fVec); }
    SkPx saturatedAdd(const SkPx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    Wide operator*(const Alpha& a) const {
        __m128i pLo = _mm_unpacklo_epi8( fVec, _mm_setzero_si128()),
                aLo = _mm_unpacklo_epi8(a.fVec, _mm_setzero_si128()),
                pHi = _mm_unpackhi_epi8( fVec, _mm_setzero_si128()),
                aHi = _mm_unpackhi_epi8(a.fVec, _mm_setzero_si128());
        return Wide(_mm_mullo_epi16(pLo, aLo), _mm_mullo_epi16(pHi, aHi));
    }
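    // Approximates (x * a) / 255 as (x*a + x) >> 8, i.e. (x * (a+1)) >> 8: within one
    // of the exact result, with no integer divide.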
    SkPx approxMulDiv255(const Alpha& a) const {
        return (*this * a).addNarrowHi(*this);
    }

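    // Adds a into this pixel's alpha channel only; the 0xFF000000 mask keeps the
    // r, g, and b lanes untouched.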
    SkPx addAlpha(const Alpha& a) const {
        return _mm_add_epi8(fVec, _mm_and_si128(a.fVec, _mm_set1_epi32(0xFF000000)));
    }
};

}  // namespace sse

typedef sse::SkPx SkPx;

#endif//SkPx_sse_DEFINED