/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkPx_sse_DEFINED
#define SkPx_sse_DEFINED

#include "SkTypes.h"    // SkASSERT, SK_CPU_SSE_LEVEL
#include <immintrin.h>  // SSE intrinsics
// SkPx_sse's sweet spot is to work with 4 pixels at a time,
// stored interlaced, just as they sit in memory: rgba rgba rgba rgba.

// SkPx_sse's best way to work with alphas is similar,
// replicating the 4 alphas 4 times each across the pixel: aaaa aaaa aaaa aaaa.

// When working with fewer than 4 pixels, we load the pixels in the low lanes,
// usually filling the top lanes with zeros (but who cares, might be junk).
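
// For example (an illustration of the layout described above):
//
//   pixels, as loaded by LoadN(): [ r0 g0 b0 a0 | r1 g1 b1 a1 | r2 g2 b2 a2 | r3 g3 b3 a3 ]
//   alphas, as built by alpha():  [ a0 a0 a0 a0 | a1 a1 a1 a1 | a2 a2 a2 a2 | a3 a3 a3 a3 ]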

struct SkPx_sse {
    static const int N = 4;

    __m128i fVec;
    SkPx_sse(__m128i vec) : fVec(vec) {}

    static SkPx_sse Dup(uint32_t px) { return _mm_set1_epi32(px); }
    static SkPx_sse LoadN(const uint32_t* px) { return _mm_loadu_si128((const __m128i*)px); }
    static SkPx_sse Load(int n, const uint32_t* px) {
        SkASSERT(n > 0 && n < 4);
        switch (n) {
            case 1: return _mm_cvtsi32_si128(px[0]);
            case 2: return _mm_loadl_epi64((const __m128i*)px);
            case 3: return _mm_or_si128(_mm_loadl_epi64((const __m128i*)px),
                                        _mm_slli_si128(_mm_cvtsi32_si128(px[2]), 8));
        }
        return _mm_setzero_si128();  // Not actually reachable.
    }

    void storeN(uint32_t* px) const { _mm_storeu_si128((__m128i*)px, fVec); }
    void store(int n, uint32_t* px) const {
        SkASSERT(n > 0 && n < 4);
        __m128i v = fVec;
        if (n & 1) {
            *px++ = _mm_cvtsi128_si32(v);
            v = _mm_srli_si128(v, 4);
        }
        if (n & 2) {
            _mm_storel_epi64((__m128i*)px, v);
        }
    }

    struct Alpha {
        __m128i fVec;
        Alpha(__m128i vec) : fVec(vec) {}

        static Alpha Dup(uint8_t a) { return _mm_set1_epi8(a); }
        static Alpha LoadN(const uint8_t* a) {
            __m128i as = _mm_cvtsi32_si128(*(const uint32_t*)a);  // ____ ____ ____ 3210
        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
            return _mm_shuffle_epi8(as, _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0));
        #else
            as = _mm_unpacklo_epi8 (as, _mm_setzero_si128());     // ____ ____ _3_2 _1_0
            as = _mm_unpacklo_epi16(as, _mm_setzero_si128());     // ___3 ___2 ___1 ___0
            as = _mm_or_si128(as, _mm_slli_si128(as, 1));         // __33 __22 __11 __00
            return _mm_or_si128(as, _mm_slli_si128(as, 2));       // 3333 2222 1111 0000
        #endif
        }
        static Alpha Load(int n, const uint8_t* a) {
            SkASSERT(n > 0 && n < 4);
            uint8_t a4[] = { 0,0,0,0 };
            switch (n) {
                case 3: a4[2] = a[2];  // fall through
                case 2: a4[1] = a[1];  // fall through
                case 1: a4[0] = a[0];
            }
            return LoadN(a4);
        }

        Alpha inv() const { return _mm_sub_epi8(_mm_set1_epi8(~0), fVec); }
    };

    struct Wide {
        __m128i fLo, fHi;
        Wide(__m128i lo, __m128i hi) : fLo(lo), fHi(hi) {}

        Wide operator+(const Wide& o) const {
            return Wide(_mm_add_epi16(fLo, o.fLo), _mm_add_epi16(fHi, o.fHi));
        }
        Wide operator-(const Wide& o) const {
            return Wide(_mm_sub_epi16(fLo, o.fLo), _mm_sub_epi16(fHi, o.fHi));
        }
        Wide operator<<(int bits) const {
            return Wide(_mm_slli_epi16(fLo, bits), _mm_slli_epi16(fHi, bits));
        }
        Wide operator>>(int bits) const {
            return Wide(_mm_srli_epi16(fLo, bits), _mm_srli_epi16(fHi, bits));
        }

        SkPx_sse addNarrowHi(const SkPx_sse& o) const {
            Wide sum = (*this + o.widenLo()) >> 8;
            return _mm_packus_epi16(sum.fLo, sum.fHi);
        }
    };

    Alpha alpha() const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
        return _mm_shuffle_epi8(fVec, _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3));
    #else
        __m128i as = _mm_srli_epi32(fVec, 24);           // ___3 ___2 ___1 ___0
        as = _mm_or_si128(as, _mm_slli_si128(as, 1));    // __33 __22 __11 __00
        return _mm_or_si128(as, _mm_slli_si128(as, 2));  // 3333 2222 1111 0000
    #endif
    }

    Wide widenLo() const {
        return Wide(_mm_unpacklo_epi8(fVec, _mm_setzero_si128()),
                    _mm_unpackhi_epi8(fVec, _mm_setzero_si128()));
    }
    Wide widenHi() const {
        return Wide(_mm_unpacklo_epi8(_mm_setzero_si128(), fVec),
                    _mm_unpackhi_epi8(_mm_setzero_si128(), fVec));
    }
    Wide widenLoHi() const {
        return Wide(_mm_unpacklo_epi8(fVec, fVec),
                    _mm_unpackhi_epi8(fVec, fVec));
    }

    SkPx_sse operator+(const SkPx_sse& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkPx_sse operator-(const SkPx_sse& o) const { return _mm_sub_epi8(fVec, o.fVec); }
    SkPx_sse saturatedAdd(const SkPx_sse& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    Wide operator*(const Alpha& a) const {
        __m128i pLo = _mm_unpacklo_epi8( fVec, _mm_setzero_si128()),
                aLo = _mm_unpacklo_epi8(a.fVec, _mm_setzero_si128()),
                pHi = _mm_unpackhi_epi8( fVec, _mm_setzero_si128()),
                aHi = _mm_unpackhi_epi8(a.fVec, _mm_setzero_si128());
        return Wide(_mm_mullo_epi16(pLo, aLo), _mm_mullo_epi16(pHi, aHi));
    }
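    // approxMulDiv255() computes ((*this * a) + *this) >> 8, i.e. x*(a+1) >> 8 per channel:
    // a cheap approximation of x*a/255 that is off by at most 1 and never overflows 16 bits
    // (255*255 + 255 = 65535).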
    SkPx_sse approxMulDiv255(const Alpha& a) const {
        return (*this * a).addNarrowHi(*this);
    }

    SkPx_sse addAlpha(const Alpha& a) const {
        return _mm_add_epi8(fVec, _mm_and_si128(a.fVec, _mm_set1_epi32(0xFF000000)));
    }
};

typedef SkPx_sse SkPx;
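
// Example (an illustrative sketch, not part of the original header): a src-over
// blend over n pixels, processing full 4-pixel strides with LoadN()/storeN() and
// the 1-3 pixel tail with Load()/store(). The function name and loop structure
// are assumptions about typical callers, not code from this file.
static inline void SkPx_sse_example_srcover(int n, uint32_t* dst, const uint32_t* src) {
    while (n >= SkPx::N) {
        SkPx s = SkPx::LoadN(src),
             d = SkPx::LoadN(dst);
        // dst' = src + dst*(1 - srcAlpha), with the *(1-a)/255 step done by approxMulDiv255().
        (s + d.approxMulDiv255(s.alpha().inv())).storeN(dst);
        src += SkPx::N;
        dst += SkPx::N;
        n   -= SkPx::N;
    }
    if (n > 0) {
        SkPx s = SkPx::Load(n, src),
             d = SkPx::Load(n, dst);
        (s + d.approxMulDiv255(s.alpha().inv())).store(n, dst);
    }
}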

#endif//SkPx_sse_DEFINED