OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkPx_sse_DEFINED | 8 #ifndef SkPx_sse_DEFINED |
9 #define SkPx_sse_DEFINED | 9 #define SkPx_sse_DEFINED |
10 | 10 |
11 // SkPx_sse's sweet spot is to work with 4 pixels at a time, | 11 // sse::SkPx's sweet spot is to work with 4 pixels at a time, |
12 // stored interlaced, just as they sit in memory: rgba rgba rgba rgba. | 12 // stored interlaced, just as they sit in memory: rgba rgba rgba rgba. |
13 | 13 |
14 // SkPx_sse's best way to work with alphas is similar, | 14 // sse::SkPx's best way to work with alphas is similar, |
15 // replicating the 4 alphas 4 times each across the pixel: aaaa aaaa aaaa aaaa. | 15 // replicating the 4 alphas 4 times each across the pixel: aaaa aaaa aaaa aaaa. |
16 | 16 |
17 // When working with fewer than 4 pixels, we load the pixels in the low lanes, | 17 // When working with fewer than 4 pixels, we load the pixels in the low lanes, |
18 // usually filling the top lanes with zeros (but who cares, might be junk). | 18 // usually filling the top lanes with zeros (but who cares, might be junk). |
19 | 19 |
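A quick scalar picture of the two layouts described in the comments above (an editor's sketch, not part of this CL; replicate_alphas is an invented name used only for illustration):

    // Editor's sketch: scalar view of the interlaced pixel and replicated alpha layouts.
    #include <stdint.h>

    // Four pixels as they sit in memory:  r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
    // The matching Alpha layout repeats each alpha across its pixel's four lanes:
    //                                     a0 a0 a0 a0  a1 a1 a1 a1  a2 a2 a2 a2  a3 a3 a3 a3
    // This hypothetical helper does in scalar code what alpha() below does with
    // _mm_shuffle_epi8 (or the shift-and-or fallback).
    static void replicate_alphas(const uint8_t px[16], uint8_t alphas[16]) {
        for (int i = 0; i < 4; i++) {           // each of the 4 pixels
            for (int j = 0; j < 4; j++) {       // each of that pixel's 4 lanes
                alphas[4*i + j] = px[4*i + 3];  // byte 3 of an rgba pixel is its alpha
            }
        }
    }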
20 struct SkPx_sse { | 20 namespace sse { |
| 21 |
| 22 struct SkPx { |
21 static const int N = 4; | 23 static const int N = 4; |
22 | 24 |
23 __m128i fVec; | 25 __m128i fVec; |
24 SkPx_sse(__m128i vec) : fVec(vec) {} | 26 SkPx(__m128i vec) : fVec(vec) {} |
25 | 27 |
26 static SkPx_sse Dup(uint32_t px) { return _mm_set1_epi32(px); } | 28 static SkPx Dup(uint32_t px) { return _mm_set1_epi32(px); } |
27     static SkPx_sse Load(const uint32_t* px) { return _mm_loadu_si128((const __m128i*)px); } | 29     static SkPx Load(const uint32_t* px) { return _mm_loadu_si128((const __m128i*)px); } |
28 static SkPx_sse Load(const uint32_t* px, int n) { | 30 static SkPx Load(const uint32_t* px, int n) { |
29 SkASSERT(n > 0 && n < 4); | 31 SkASSERT(n > 0 && n < 4); |
30 switch (n) { | 32 switch (n) { |
31 case 1: return _mm_cvtsi32_si128(px[0]); | 33 case 1: return _mm_cvtsi32_si128(px[0]); |
32 case 2: return _mm_loadl_epi64((const __m128i*)px); | 34 case 2: return _mm_loadl_epi64((const __m128i*)px); |
33 case 3: return _mm_or_si128(_mm_loadl_epi64((const __m128i*)px), | 35 case 3: return _mm_or_si128(_mm_loadl_epi64((const __m128i*)px), |
34                                         _mm_slli_si128(_mm_cvtsi32_si128(px[2]), 8)); | 36                                         _mm_slli_si128(_mm_cvtsi32_si128(px[2]), 8)); |
35 } | 37 } |
36 return _mm_setzero_si128(); // Not actually reachable. | 38 return _mm_setzero_si128(); // Not actually reachable. |
37 } | 39 } |
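For context, a caller might drive the full and partial loads above like this (editor's sketch, not part of this CL; for_each_4_pixels and process are hypothetical names, the header is assumed to be included, and storing results is omitted since SkPx's store helpers sit in the elided lines below):

    // Editor's sketch: process 4 pixels at a time, then a 1-3 pixel tail.
    #include <stdint.h>

    template <typename Fn>
    void for_each_4_pixels(const uint32_t* px, int count, Fn process) {
        while (count >= 4) {
            process(SkPx::Load(px));         // full 4-pixel load
            px += 4;
            count -= 4;
        }
        if (count > 0) {
            process(SkPx::Load(px, count));  // partial load of the remaining 1-3 pixels
        }
    }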
38 | 40 |
(...skipping 50 matching lines...)
89 Wide operator-(const Wide& o) const { | 91 Wide operator-(const Wide& o) const { |
90 return Wide(_mm_sub_epi16(fLo, o.fLo), _mm_sub_epi16(fHi, o.fHi)); | 92 return Wide(_mm_sub_epi16(fLo, o.fLo), _mm_sub_epi16(fHi, o.fHi)); |
91 } | 93 } |
92 template <int bits> Wide shl() const { | 94 template <int bits> Wide shl() const { |
93 return Wide(_mm_slli_epi16(fLo, bits), _mm_slli_epi16(fHi, bits)); | 95 return Wide(_mm_slli_epi16(fLo, bits), _mm_slli_epi16(fHi, bits)); |
94 } | 96 } |
95 template <int bits> Wide shr() const { | 97 template <int bits> Wide shr() const { |
96 return Wide(_mm_srli_epi16(fLo, bits), _mm_srli_epi16(fHi, bits)); | 98 return Wide(_mm_srli_epi16(fLo, bits), _mm_srli_epi16(fHi, bits)); |
97 } | 99 } |
98 | 100 |
99 SkPx_sse addNarrowHi(const SkPx_sse& o) const { | 101 SkPx addNarrowHi(const SkPx& o) const { |
100 Wide sum = (*this + o.widenLo()).shr<8>(); | 102 Wide sum = (*this + o.widenLo()).shr<8>(); |
101 return _mm_packus_epi16(sum.fLo, sum.fHi); | 103 return _mm_packus_epi16(sum.fLo, sum.fHi); |
102 } | 104 } |
103 }; | 105 }; |
104 | 106 |
105 Alpha alpha() const { | 107 Alpha alpha() const { |
106 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 108 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
107         return _mm_shuffle_epi8(fVec, _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3)); | 109         return _mm_shuffle_epi8(fVec, _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3)); |
108 #else | 110 #else |
109 __m128i as = _mm_srli_epi32(fVec, 24); // ___3 ___2 ___1 ___0 | 111 __m128i as = _mm_srli_epi32(fVec, 24); // ___3 ___2 ___1 ___0 |
110 as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00 | 112 as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00 |
111 return _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000 | 113 return _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000 |
112 #endif | 114 #endif |
113 } | 115 } |
114 | 116 |
115 Wide widenLo() const { | 117 Wide widenLo() const { |
116 return Wide(_mm_unpacklo_epi8(fVec, _mm_setzero_si128()), | 118 return Wide(_mm_unpacklo_epi8(fVec, _mm_setzero_si128()), |
117 _mm_unpackhi_epi8(fVec, _mm_setzero_si128())); | 119 _mm_unpackhi_epi8(fVec, _mm_setzero_si128())); |
118 } | 120 } |
119 Wide widenHi() const { | 121 Wide widenHi() const { |
120 return Wide(_mm_unpacklo_epi8(_mm_setzero_si128(), fVec), | 122 return Wide(_mm_unpacklo_epi8(_mm_setzero_si128(), fVec), |
121 _mm_unpackhi_epi8(_mm_setzero_si128(), fVec)); | 123 _mm_unpackhi_epi8(_mm_setzero_si128(), fVec)); |
122 } | 124 } |
123 Wide widenLoHi() const { | 125 Wide widenLoHi() const { |
124 return Wide(_mm_unpacklo_epi8(fVec, fVec), | 126 return Wide(_mm_unpacklo_epi8(fVec, fVec), |
125 _mm_unpackhi_epi8(fVec, fVec)); | 127 _mm_unpackhi_epi8(fVec, fVec)); |
126 } | 128 } |
127 | 129 |
128     SkPx_sse operator+(const SkPx_sse& o) const { return _mm_add_epi8(fVec, o.fVec); } | 130     SkPx operator+(const SkPx& o) const { return _mm_add_epi8(fVec, o.fVec); } |
129     SkPx_sse operator-(const SkPx_sse& o) const { return _mm_sub_epi8(fVec, o.fVec); } | 131     SkPx operator-(const SkPx& o) const { return _mm_sub_epi8(fVec, o.fVec); } |
130     SkPx_sse saturatedAdd(const SkPx_sse& o) const { return _mm_adds_epi8(fVec, o.fVec); } | 132     SkPx saturatedAdd(const SkPx& o) const { return _mm_adds_epi8(fVec, o.fVec); } |
131 | 133 |
132 Wide operator*(const Alpha& a) const { | 134 Wide operator*(const Alpha& a) const { |
133 __m128i pLo = _mm_unpacklo_epi8( fVec, _mm_setzero_si128()), | 135 __m128i pLo = _mm_unpacklo_epi8( fVec, _mm_setzero_si128()), |
134 aLo = _mm_unpacklo_epi8(a.fVec, _mm_setzero_si128()), | 136 aLo = _mm_unpacklo_epi8(a.fVec, _mm_setzero_si128()), |
135 pHi = _mm_unpackhi_epi8( fVec, _mm_setzero_si128()), | 137 pHi = _mm_unpackhi_epi8( fVec, _mm_setzero_si128()), |
136 aHi = _mm_unpackhi_epi8(a.fVec, _mm_setzero_si128()); | 138 aHi = _mm_unpackhi_epi8(a.fVec, _mm_setzero_si128()); |
137 return Wide(_mm_mullo_epi16(pLo, aLo), _mm_mullo_epi16(pHi, aHi)); | 139 return Wide(_mm_mullo_epi16(pLo, aLo), _mm_mullo_epi16(pHi, aHi)); |
138 } | 140 } |
139 SkPx_sse approxMulDiv255(const Alpha& a) const { | 141 SkPx approxMulDiv255(const Alpha& a) const { |
140 return (*this * a).addNarrowHi(*this); | 142 return (*this * a).addNarrowHi(*this); |
141 } | 143 } |
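The trick here: addNarrowHi() adds the zero-widened original pixel to the 16-bit products, shifts right by 8, and packs back down to bytes, so approxMulDiv255() computes (p*a + p) >> 8, i.e. (p*(a+1)) >> 8, the usual cheap stand-in for p*a/255. A scalar equivalent (editor's sketch, not part of this CL):

    // Editor's sketch: scalar approxMulDiv255.  Exact when a is 0 or 255,
    // and within 1 of the true p*a/255 everywhere else.
    #include <stdint.h>

    static uint8_t approx_mul_div_255(uint8_t p, uint8_t a) {
        return (uint8_t)((p * a + p) >> 8);
    }

    // e.g. approx_mul_div_255(255, 255) == 255, approx_mul_div_255(200, 128) == 100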
142 | 144 |
143 SkPx_sse addAlpha(const Alpha& a) const { | 145 SkPx addAlpha(const Alpha& a) const { |
144         return _mm_add_epi8(fVec, _mm_and_si128(a.fVec, _mm_set1_epi32(0xFF000000))); | 146         return _mm_add_epi8(fVec, _mm_and_si128(a.fVec, _mm_set1_epi32(0xFF000000))); |
145 } | 147 } |
146 }; | 148 }; |
147 | 149 |
148 typedef SkPx_sse SkPx; | 150 } // namespace sse |
| 151 |
| 152 typedef sse::SkPx SkPx; |
149 | 153 |
150 #endif//SkPx_sse_DEFINED | 154 #endif//SkPx_sse_DEFINED |