OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
10 | 10 |
11 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent. | 11 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent. |
| 12 // If you do, make sure this is in a static inline function... anywhere else risks violating ODR. |
12 | 13 |
13 #define SKNX_IS_FAST | 14 #define SKNX_IS_FAST |
14 | 15 |
15 namespace { // See SkNx.h | |
16 | |
17 | |
18 template <> | 16 template <> |
19 class SkNx<2, float> { | 17 class SkNx<2, float> { |
20 public: | 18 public: |
21 SkNx(const __m128& vec) : fVec(vec) {} | 19 SkNx(const __m128& vec) : fVec(vec) {} |
22 | 20 |
23 SkNx() {} | 21 SkNx() {} |
24 SkNx(float val) : fVec(_mm_set1_ps(val)) {} | 22 SkNx(float val) : fVec(_mm_set1_ps(val)) {} |
25 static SkNx Load(const void* ptr) { | 23 static SkNx Load(const void* ptr) { |
26 return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr)); | 24 return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr)); |
27 } | 25 } |
28 SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {} | 26 SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {} |
29 | 27 |
30 void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); } | 28 void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); } |
31 | 29 |
32 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 30 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } |
33 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 31 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } |
34 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 32 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } |
35 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 33 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } |
36 | 34 |
37 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} | 35 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} |
38 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} | 36 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} |
39 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} | 37 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} |
40 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} | 38 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} |
41 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec);
} | 39 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec);
} |
42 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec);
} | 40 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec);
} |
43 | 41 |
44 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.
fVec); } | 42 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.
fVec); } |
45 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.
fVec); } | 43 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.
fVec); } |
46 | 44 |
47 SkNx sqrt() const { return _mm_sqrt_ps (fVec); } | 45 SkNx sqrt () const { return _mm_sqrt_ps (fVec); } |
48 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } | 46 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } |
49 SkNx rsqrt1() const { return this->rsqrt0(); } | 47 SkNx rsqrt1() const { return this->rsqrt0(); } |
50 SkNx rsqrt2() const { return this->rsqrt1(); } | 48 SkNx rsqrt2() const { return this->rsqrt1(); } |
51 | 49 |
52 SkNx invert() const { return SkNx(1) / *this; } | 50 SkNx invert() const { return SkNx(1) / *this; } |
53 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } | 51 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } |
54 | 52 |
55 template <int k> float kth() const { | 53 float operator[](int k) const { |
56 SkASSERT(0 <= k && k < 2); | 54 SkASSERT(0 <= k && k < 2); |
57 union { __m128 v; float fs[4]; } pun = {fVec}; | 55 union { __m128 v; float fs[4]; } pun = {fVec}; |
58 return pun.fs[k&1]; | 56 return pun.fs[k&1]; |
59 } | 57 } |
| 58 template <int k> float kth() const { return (*this)[k]; } |
60 | 59 |
61 bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fV
ec)) & 0xff); } | 60 bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fV
ec)) & 0xff); } |
62 bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fV
ec)) & 0xff); } | 61 bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fV
ec)) & 0xff); } |
63 | 62 |
64 __m128 fVec; | 63 __m128 fVec; |
65 }; | 64 }; |
66 | 65 |
67 template <> | 66 template <> |
68 class SkNx<2, double> { | |
69 public: | |
70 SkNx(const __m128d& vec) : fVec(vec) {} | |
71 | |
72 SkNx() {} | |
73 SkNx(double val) : fVec(_mm_set1_pd(val)) {} | |
74 static SkNx Load(const void* ptr) { return _mm_loadu_pd((const double*)ptr);
} | |
75 SkNx(double a, double b) : fVec(_mm_setr_pd(a,b)) {} | |
76 | |
77 void store(void* ptr) const { _mm_storeu_pd((double*)ptr, fVec); } | |
78 | |
79 SkNx operator + (const SkNx& o) const { return _mm_add_pd(fVec, o.fVec); } | |
80 SkNx operator - (const SkNx& o) const { return _mm_sub_pd(fVec, o.fVec); } | |
81 SkNx operator * (const SkNx& o) const { return _mm_mul_pd(fVec, o.fVec); } | |
82 SkNx operator / (const SkNx& o) const { return _mm_div_pd(fVec, o.fVec); } | |
83 | |
84 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_pd (fVec, o.fVec);
} | |
85 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_pd(fVec, o.fVec);
} | |
86 SkNx operator < (const SkNx& o) const { return _mm_cmplt_pd (fVec, o.fVec);
} | |
87 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_pd (fVec, o.fVec);
} | |
88 SkNx operator <= (const SkNx& o) const { return _mm_cmple_pd (fVec, o.fVec);
} | |
89 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_pd (fVec, o.fVec);
} | |
90 | |
91 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_pd(l.fVec, r.
fVec); } | |
92 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_pd(l.fVec, r.
fVec); } | |
93 | |
94 SkNx sqrt() const { return _mm_sqrt_pd(fVec); } | |
95 | |
96 template <int k> double kth() const { | |
97 SkASSERT(0 <= k && k < 2); | |
98 union { __m128d v; double fs[2]; } pun = {fVec}; | |
99 return pun.fs[k&1]; | |
100 } | |
101 | |
102 bool allTrue() const { return 0x3 == _mm_movemask_pd(fVec); } | |
103 bool anyTrue() const { return 0x0 != _mm_movemask_pd(fVec); } | |
104 | |
105 SkNx thenElse(const SkNx& t, const SkNx& e) const { | |
106 return _mm_or_pd(_mm_and_pd (fVec, t.fVec), | |
107 _mm_andnot_pd(fVec, e.fVec)); | |
108 } | |
109 | |
110 __m128d fVec; | |
111 }; | |
112 | |
113 template <> | |
114 class SkNx<4, int> { | |
115 public: | |
116 SkNx(const __m128i& vec) : fVec(vec) {} | |
117 | |
118 SkNx() {} | |
119 SkNx(int val) : fVec(_mm_set1_epi32(val)) {} | |
120 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)p
tr); } | |
121 SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} | |
122 | |
123 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } | |
124 | |
125 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec);
} | |
126 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec);
} | |
127 SkNx operator * (const SkNx& o) const { | |
128 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), | |
129 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.
fVec, 4)); | |
130 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0))
, | |
131 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))
); | |
132 } | |
133 | |
134 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } | |
135 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } | |
136 | |
137 template <int k> int kth() const { | |
138 SkASSERT(0 <= k && k < 4); | |
139 switch (k) { | |
140 case 0: return _mm_cvtsi128_si32(fVec); | |
141 case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 4)); | |
142 case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 8)); | |
143 case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12)); | |
144 default: SkASSERT(false); return 0; | |
145 } | |
146 } | |
147 | |
148 __m128i fVec; | |
149 }; | |
150 | |
151 template <> | |
152 class SkNx<4, float> { | 67 class SkNx<4, float> { |
153 public: | 68 public: |
154 SkNx(const __m128& vec) : fVec(vec) {} | 69 SkNx(const __m128& vec) : fVec(vec) {} |
155 | 70 |
156 SkNx() {} | 71 SkNx() {} |
157 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} | 72 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} |
158 static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr);
} | 73 static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr);
} |
159 | 74 |
160 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} | 75 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} |
161 | 76 |
162 void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); } | 77 void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); } |
163 | 78 |
164 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 79 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } |
165 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 80 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } |
166 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 81 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } |
167 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 82 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } |
168 | 83 |
169 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} | 84 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} |
170 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} | 85 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} |
171 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} | 86 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} |
172 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} | 87 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} |
173 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec);
} | 88 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec);
} |
174 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec);
} | 89 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec);
} |
175 | 90 |
176 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.
fVec); } | 91 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.
fVec); } |
177 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.
fVec); } | 92 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.
fVec); } |
178 | 93 |
179 SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } | 94 SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } |
180 | 95 |
181 SkNx sqrt() const { return _mm_sqrt_ps (fVec); } | 96 SkNx sqrt () const { return _mm_sqrt_ps (fVec); } |
182 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } | 97 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } |
183 SkNx rsqrt1() const { return this->rsqrt0(); } | 98 SkNx rsqrt1() const { return this->rsqrt0(); } |
184 SkNx rsqrt2() const { return this->rsqrt1(); } | 99 SkNx rsqrt2() const { return this->rsqrt1(); } |
185 | 100 |
186 SkNx invert() const { return SkNx(1) / *this; } | 101 SkNx invert() const { return SkNx(1) / *this; } |
187 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } | 102 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } |
188 | 103 |
189 template <int k> float kth() const { | 104 float operator[](int k) const { |
190 SkASSERT(0 <= k && k < 4); | 105 SkASSERT(0 <= k && k < 4); |
191 union { __m128 v; float fs[4]; } pun = {fVec}; | 106 union { __m128 v; float fs[4]; } pun = {fVec}; |
192 return pun.fs[k&3]; | 107 return pun.fs[k&3]; |
193 } | 108 } |
| 109 template <int k> float kth() const { return (*this)[k]; } |
194 | 110 |
195 bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(f
Vec)); } | 111 bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(f
Vec)); } |
196 bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(f
Vec)); } | 112 bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(f
Vec)); } |
197 | 113 |
198 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 114 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
199 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), | 115 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), |
200 _mm_andnot_ps(fVec, e.fVec)); | 116 _mm_andnot_ps(fVec, e.fVec)); |
201 } | 117 } |
202 | 118 |
203 __m128 fVec; | 119 __m128 fVec; |
(...skipping 11 matching lines...) Expand all Loading... |
215 | 131 |
216 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } | 132 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } |
217 | 133 |
218 SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec);
} | 134 SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec);
} |
219 SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec);
} | 135 SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec);
} |
220 SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec)
; } | 136 SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec)
; } |
221 | 137 |
222 SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); } | 138 SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); } |
223 SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); } | 139 SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); } |
224 | 140 |
225 template <int k> uint16_t kth() const { | 141 uint16_t operator[](int k) const { |
226 SkASSERT(0 <= k && k < 4); | 142 SkASSERT(0 <= k && k < 4); |
227 return _mm_extract_epi16(fVec, k); | 143 union { __m128i v; uint16_t us[8]; } pun = {fVec}; |
| 144 return pun.us[k&3]; |
228 } | 145 } |
| 146 template <int k> uint16_t kth() const { return (*this)[k]; } |
229 | 147 |
230 __m128i fVec; | 148 __m128i fVec; |
231 }; | 149 }; |
232 | 150 |
233 template <> | 151 template <> |
234 class SkNx<8, uint16_t> { | 152 class SkNx<8, uint16_t> { |
235 public: | 153 public: |
236 SkNx(const __m128i& vec) : fVec(vec) {} | 154 SkNx(const __m128i& vec) : fVec(vec) {} |
237 | 155 |
238 SkNx() {} | 156 SkNx() {} |
(...skipping 18 matching lines...) Expand all Loading... |
257 const __m128i top_8x = _mm_set1_epi16(top); | 175 const __m128i top_8x = _mm_set1_epi16(top); |
258 return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x), | 176 return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x), |
259 _mm_sub_epi8(b.fVec, top_8x)))
; | 177 _mm_sub_epi8(b.fVec, top_8x)))
; |
260 } | 178 } |
261 | 179 |
262 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 180 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
263 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 181 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
264 _mm_andnot_si128(fVec, e.fVec)); | 182 _mm_andnot_si128(fVec, e.fVec)); |
265 } | 183 } |
266 | 184 |
267 template <int k> uint16_t kth() const { | 185 uint16_t operator[](int k) const { |
268 SkASSERT(0 <= k && k < 8); | 186 SkASSERT(0 <= k && k < 8); |
269 return _mm_extract_epi16(fVec, k); | 187 union { __m128i v; uint16_t us[8]; } pun = {fVec}; |
| 188 return pun.us[k&7]; |
270 } | 189 } |
| 190 template <int k> uint16_t kth() const { return (*this)[k]; } |
271 | 191 |
272 __m128i fVec; | 192 __m128i fVec; |
273 }; | 193 }; |
274 | 194 |
275 template <> | 195 template <> |
276 class SkNx<4, uint8_t> { | 196 class SkNx<4, uint8_t> { |
277 public: | 197 public: |
278 SkNx(const __m128i& vec) : fVec(vec) {} | 198 SkNx(const __m128i& vec) : fVec(vec) {} |
279 | 199 |
280 SkNx() {} | 200 SkNx() {} |
281 static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)pt
r); } | 201 static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)pt
r); } |
282 void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } | 202 void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } |
283 | 203 |
284 // TODO as needed | 204 // TODO as needed |
285 | 205 |
286 __m128i fVec; | 206 __m128i fVec; |
287 }; | 207 }; |
288 | 208 |
289 template <> | 209 template <> |
290 class SkNx<8, uint8_t> { | |
291 public: | |
292 SkNx(const __m128i& vec) : fVec(vec) {} | |
293 | |
294 SkNx() {} | |
295 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)p
tr); } | |
296 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } | |
297 | |
298 // TODO as needed | |
299 | |
300 __m128i fVec; | |
301 }; | |
302 | |
303 template <> | |
304 class SkNx<16, uint8_t> { | 210 class SkNx<16, uint8_t> { |
305 public: | 211 public: |
306 SkNx(const __m128i& vec) : fVec(vec) {} | 212 SkNx(const __m128i& vec) : fVec(vec) {} |
307 | 213 |
308 SkNx() {} | 214 SkNx() {} |
309 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} | 215 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} |
310 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)p
tr); } | 216 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)p
tr); } |
311 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 217 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
312 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 218 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
313 uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 219 uint8_t i, uint8_t j, uint8_t k, uint8_t l, |
314 uint8_t m, uint8_t n, uint8_t o, uint8_t p) | 220 uint8_t m, uint8_t n, uint8_t o, uint8_t p) |
315 : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {} | 221 : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {} |
316 | 222 |
317 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } | 223 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } |
318 | 224 |
319 SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec);
} | 225 SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec);
} |
320 | 226 |
321 SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); } | 227 SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); } |
322 SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); } | 228 SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); } |
323 | 229 |
324 static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec,
b.fVec); } | 230 static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec,
b.fVec); } |
325 SkNx operator < (const SkNx& o) const { | 231 SkNx operator < (const SkNx& o) const { |
326 // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare. | 232 // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare. |
327 auto flip = _mm_set1_epi8(char(0x80)); | 233 auto flip = _mm_set1_epi8(char(0x80)); |
328 return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.f
Vec)); | 234 return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.f
Vec)); |
329 } | 235 } |
330 | 236 |
331 template <int k> uint8_t kth() const { | 237 uint8_t operator[](int k) const { |
332 SkASSERT(0 <= k && k < 16); | 238 SkASSERT(0 <= k && k < 16); |
333 // SSE4.1 would just `return _mm_extract_epi8(fVec, k)`. We have to read 16-bits instead. | 239 union { __m128i v; uint8_t us[16]; } pun = {fVec}; |
334 int pair = _mm_extract_epi16(fVec, k/2); | 240 return pun.us[k&15]; |
335 return k % 2 == 0 ? pair : (pair >> 8); | |
336 } | 241 } |
| 242 template <int k> uint8_t kth() const { return (*this)[k]; } |
337 | 243 |
338 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 244 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
339 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 245 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
340 _mm_andnot_si128(fVec, e.fVec)); | 246 _mm_andnot_si128(fVec, e.fVec)); |
341 } | 247 } |
342 | 248 |
343 __m128i fVec; | 249 __m128i fVec; |
344 }; | 250 }; |
345 | 251 |
346 | 252 |
347 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { | 253 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
348 return _mm_cvttps_epi32(src.fVec); | |
349 } | |
350 | |
351 template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) { | |
352 auto _32 = _mm_cvttps_epi32(src.fVec); | 254 auto _32 = _mm_cvttps_epi32(src.fVec); |
353 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. | 255 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. |
354 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 256 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
355 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. | 257 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. |
356 const int _ = ~0; | 258 const int _ = ~0; |
357 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_
,_,_)); | 259 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_
,_,_)); |
358 #else | 260 #else |
359 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: | 261 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: |
360 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); | 262 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); |
361 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000
)); | 263 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000
)); |
362 #endif | 264 #endif |
363 } | 265 } |
364 | 266 |
365 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { | 267 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
366 auto _32 = _mm_cvttps_epi32(src.fVec); | 268 auto _32 = _mm_cvttps_epi32(src.fVec); |
367 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 269 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
368 const int _ = ~0; | 270 const int _ = ~0; |
369 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); | 271 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); |
370 #else | 272 #else |
371 auto _16 = _mm_packus_epi16(_32, _32); | 273 auto _16 = _mm_packus_epi16(_32, _32); |
372 return _mm_packus_epi16(_16, _16); | 274 return _mm_packus_epi16(_16, _16); |
373 #endif | 275 #endif |
374 } | 276 } |
375 | 277 |
376 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { | 278 template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { |
377 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 279 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
378 const int _ = ~0; | 280 const int _ = ~0; |
379 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,
_, 3,_,_,_)); | 281 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,
_, 3,_,_,_)); |
380 #else | 282 #else |
381 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), | 283 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), |
382 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); | 284 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); |
383 #endif | 285 #endif |
384 return _mm_cvtepi32_ps(_32); | 286 return _mm_cvtepi32_ps(_32); |
385 } | 287 } |
386 | 288 |
387 template<> inline Sk4f SkNx_cast<float, uint16_t, 4>(const Sk4h& src) { | 289 template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { |
388 auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); | 290 auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); |
389 return _mm_cvtepi32_ps(_32); | 291 return _mm_cvtepi32_ps(_32); |
390 } | 292 } |
391 | 293 |
392 static inline void Sk4f_ToBytes(uint8_t bytes[16], | 294 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
393 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { | 295 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { |
394 _mm_storeu_si128((__m128i*)bytes, | 296 _mm_storeu_si128((__m128i*)bytes, |
395 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), | 297 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), |
396 _mm_cvttps_epi32(b.fVec))
, | 298 _mm_cvttps_epi32(b.fVec))
, |
397 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), | 299 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), |
398 _mm_cvttps_epi32(d.fVec))
)); | 300 _mm_cvttps_epi32(d.fVec))
)); |
399 } | 301 } |
400 | 302 |
401 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) { | 303 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src)
{ |
402 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); | 304 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); |
403 } | 305 } |
404 | 306 |
405 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { | 307 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src)
{ |
406 return _mm_packus_epi16(src.fVec, src.fVec); | 308 return _mm_packus_epi16(src.fVec, src.fVec); |
407 } | 309 } |
408 | 310 |
409 | |
410 } // namespace | |
411 | |
412 #endif//SkNx_sse_DEFINED | 311 #endif//SkNx_sse_DEFINED |
OLD | NEW |