Chromium Code Reviews

Unified diff: src/opts/SkNx_sse.h

Issue 1683543002: sknx refactoring (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: typos Created 4 years, 10 months ago
 /*
  * Copyright 2015 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #ifndef SkNx_sse_DEFINED
 #define SkNx_sse_DEFINED

 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
+// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.

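For context on the comment added above: the pattern it describes is gating post-SSE2 intrinsics behind SK_CPU_SSE_LEVEL inside a static inline function, so every translation unit sees one consistent definition. A minimal sketch of that pattern, assuming the SK_CPU_SSE_LEVEL macros from SkTypes.h; the helper name sknx_floor is hypothetical and not part of this CL:

    // Hypothetical helper, not part of this CL: newer intrinsics live behind
    // the level check, inside a static inline function (per the comment above).
    static inline __m128 sknx_floor(__m128 v) {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_floor_ps(v);  // SSE4.1-only instruction, compiled only when allowed.
    #else
        // SSE2 fallback (valid for values that fit in int32):
        // truncate toward zero, then subtract 1 where truncation rounded up.
        __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
        return _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(v, t), _mm_set1_ps(1.0f)));
    #endif
    }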
 #define SKNX_IS_FAST

-namespace {  // See SkNx.h
-
-
 template <>
 class SkNx<2, float> {
 public:
     SkNx(const __m128& vec) : fVec(vec) {}

     SkNx() {}
     SkNx(float val) : fVec(_mm_set1_ps(val)) {}
     static SkNx Load(const void* ptr) {
         return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
     }
     SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

     void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }

     SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
     SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

     SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
     SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
     SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
     SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
     SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
     SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

     static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
     static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

-    SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
+    SkNx sqrt () const { return _mm_sqrt_ps (fVec); }
     SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
     SkNx rsqrt1() const { return this->rsqrt0(); }
     SkNx rsqrt2() const { return this->rsqrt1(); }

     SkNx invert() const { return SkNx(1) / *this; }
     SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

-    template <int k> float kth() const {
+    float operator[](int k) const {
         SkASSERT(0 <= k && k < 2);
         union { __m128 v; float fs[4]; } pun = {fVec};
         return pun.fs[k&1];
     }
+    template <int k> float kth() const { return (*this)[k]; }

     bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
     bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }

     __m128 fVec;
 };
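The indexing change in this class (and repeated below) replaces the template-parameter accessor with a plain operator[] that reads a lane by type-punning the register through a union, while kth<k>() survives as a thin forwarder. A small usage sketch, illustrative only:

    SkNx<2, float> v(3.0f, 4.0f);
    float x = v[0];        // 3.0f, read through the union pun
    float y = v.kth<1>();  // 4.0f, kth() now just forwards to operator[]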

 template <>
-class SkNx<2, double> {
-public:
-    SkNx(const __m128d& vec) : fVec(vec) {}
-
-    SkNx() {}
-    SkNx(double val) : fVec(_mm_set1_pd(val)) {}
-    static SkNx Load(const void* ptr) { return _mm_loadu_pd((const double*)ptr); }
-    SkNx(double a, double b) : fVec(_mm_setr_pd(a,b)) {}
-
-    void store(void* ptr) const { _mm_storeu_pd((double*)ptr, fVec); }
-
-    SkNx operator + (const SkNx& o) const { return _mm_add_pd(fVec, o.fVec); }
-    SkNx operator - (const SkNx& o) const { return _mm_sub_pd(fVec, o.fVec); }
-    SkNx operator * (const SkNx& o) const { return _mm_mul_pd(fVec, o.fVec); }
-    SkNx operator / (const SkNx& o) const { return _mm_div_pd(fVec, o.fVec); }
-
-    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_pd (fVec, o.fVec); }
-    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_pd(fVec, o.fVec); }
-    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_pd (fVec, o.fVec); }
-    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_pd (fVec, o.fVec); }
-    SkNx operator <= (const SkNx& o) const { return _mm_cmple_pd (fVec, o.fVec); }
-    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_pd (fVec, o.fVec); }
-
-    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_pd(l.fVec, r.fVec); }
-    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_pd(l.fVec, r.fVec); }
-
-    SkNx sqrt() const { return _mm_sqrt_pd(fVec); }
-
-    template <int k> double kth() const {
-        SkASSERT(0 <= k && k < 2);
-        union { __m128d v; double fs[2]; } pun = {fVec};
-        return pun.fs[k&1];
-    }
-
-    bool allTrue() const { return 0x3 == _mm_movemask_pd(fVec); }
-    bool anyTrue() const { return 0x0 != _mm_movemask_pd(fVec); }
-
-    SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return _mm_or_pd(_mm_and_pd   (fVec, t.fVec),
-                         _mm_andnot_pd(fVec, e.fVec));
-    }
-
-    __m128d fVec;
-};
-
-template <>
-class SkNx<4, int> {
-public:
-    SkNx(const __m128i& vec) : fVec(vec) {}
-
-    SkNx() {}
-    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
-    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
-    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
-
-    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
-
-    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
-    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
-    SkNx operator * (const SkNx& o) const {
-        __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
-                mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
-        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
-                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
-    }
-
-    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
-    SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
-
-    template <int k> int kth() const {
-        SkASSERT(0 <= k && k < 4);
-        switch (k) {
-            case 0: return _mm_cvtsi128_si32(fVec);
-            case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  4));
-            case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  8));
-            case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12));
-            default: SkASSERT(false); return 0;
-        }
-    }
-
-    __m128i fVec;
-};
-
-template <>
 class SkNx<4, float> {
 public:
     SkNx(const __m128& vec) : fVec(vec) {}

     SkNx() {}
     SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
     static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }

     SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

     void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

     SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
     SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

     SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
     SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
     SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
     SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
     SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
     SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

     static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
     static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

     SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }

-    SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
+    SkNx sqrt () const { return _mm_sqrt_ps (fVec); }
     SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
     SkNx rsqrt1() const { return this->rsqrt0(); }
     SkNx rsqrt2() const { return this->rsqrt1(); }

     SkNx invert() const { return SkNx(1) / *this; }
     SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

-    template <int k> float kth() const {
+    float operator[](int k) const {
         SkASSERT(0 <= k && k < 4);
         union { __m128 v; float fs[4]; } pun = {fVec};
         return pun.fs[k&3];
     }
+    template <int k> float kth() const { return (*this)[k]; }

     bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
     bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

     SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                          _mm_andnot_ps(fVec, e.fVec));
     }

     __m128 fVec;
(...skipping 11 matching lines...)

     void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

     SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

     SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
     SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

-    template <int k> uint16_t kth() const {
+    uint16_t operator[](int k) const {
         SkASSERT(0 <= k && k < 4);
-        return _mm_extract_epi16(fVec, k);
+        union { __m128i v; uint16_t us[8]; } pun = {fVec};
+        return pun.us[k&3];
     }
+    template <int k> uint16_t kth() const { return (*this)[k]; }

     __m128i fVec;
 };

 template <>
 class SkNx<8, uint16_t> {
 public:
     SkNx(const __m128i& vec) : fVec(vec) {}

     SkNx() {}
(...skipping 18 matching lines...)
         const __m128i top_8x = _mm_set1_epi16(top);
         return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                   _mm_sub_epi8(b.fVec, top_8x)));
     }

     SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                             _mm_andnot_si128(fVec, e.fVec));
     }

-    template <int k> uint16_t kth() const {
+    uint16_t operator[](int k) const {
         SkASSERT(0 <= k && k < 8);
-        return _mm_extract_epi16(fVec, k);
+        union { __m128i v; uint16_t us[8]; } pun = {fVec};
+        return pun.us[k&7];
     }
+    template <int k> uint16_t kth() const { return (*this)[k]; }

     __m128i fVec;
 };
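The thenElse() used here and in the float and byte classes is the standard SSE2 bitwise select: each lane of fVec is assumed to be all-ones or all-zeros, exactly what the comparison operators produce, so the blend reduces to (mask & t) | (~mask & e). The same idea for one scalar lane, as a sketch:

    #include <stdint.h>

    // Scalar model of the SSE2 select idiom; mask must be all-ones or all-zeros.
    static inline uint32_t select_lane(uint32_t mask, uint32_t t, uint32_t e) {
        // Note _mm_andnot_si128(a, b) computes (~a) & b, hence the operand order above.
        return (mask & t) | (~mask & e);
    }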

 template <>
 class SkNx<4, uint8_t> {
 public:
     SkNx(const __m128i& vec) : fVec(vec) {}

     SkNx() {}
     static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
     void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

     // TODO as needed

     __m128i fVec;
 };

 template <>
-class SkNx<8, uint8_t> {
-public:
-    SkNx(const __m128i& vec) : fVec(vec) {}
-
-    SkNx() {}
-    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
-    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }
-
-    // TODO as needed
-
-    __m128i fVec;
-};
-
-template <>
 class SkNx<16, uint8_t> {
 public:
     SkNx(const __m128i& vec) : fVec(vec) {}

     SkNx() {}
     SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
     static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
     SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
          uint8_t e, uint8_t f, uint8_t g, uint8_t h,
          uint8_t i, uint8_t j, uint8_t k, uint8_t l,
          uint8_t m, uint8_t n, uint8_t o, uint8_t p)
         : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

     void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

     SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

     SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

     static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
     SkNx operator < (const SkNx& o) const {
         // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
         auto flip = _mm_set1_epi8(char(0x80));
         return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
     }

-    template <int k> uint8_t kth() const {
+    uint8_t operator[](int k) const {
         SkASSERT(0 <= k && k < 16);
-        // SSE4.1 would just `return _mm_extract_epi8(fVec, k)`.  We have to read 16-bits instead.
-        int pair = _mm_extract_epi16(fVec, k/2);
-        return k % 2 == 0 ? pair : (pair >> 8);
+        union { __m128i v; uint8_t us[16]; } pun = {fVec};
+        return pun.us[k&15];
     }
+    template <int k> uint8_t kth() const { return (*this)[k]; }

     SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                             _mm_andnot_si128(fVec, e.fVec));
     }

     __m128i fVec;
 };
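The operator< in this class works around the missing _mm_cmplt_epu8 by XOR-ing the sign bit into both operands: that maps unsigned order onto signed order, so the signed compare gives the unsigned answer. The scalar equivalent, as a sketch:

    #include <stdint.h>

    // Scalar model of the sign-flip trick in operator< above.
    static inline bool lt_u8(uint8_t a, uint8_t b) {
        return (int8_t)(a ^ 0x80) < (int8_t)(b ^ 0x80);  // same result as a < b unsigned
    }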


-template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) {
-    return _mm_cvttps_epi32(src.fVec);
-}
-
-template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) {
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
     auto _32 = _mm_cvttps_epi32(src.fVec);
     // Ideally we'd use _mm_packus_epi32 here.  But that's SSE4.1+.
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
     const int _ = ~0;
     return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
 #else
     // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:
     _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));
     return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));
 #endif
 }
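The SSE2 branch above emulates unsigned saturation with the signed pack: subtracting 0x8000 maps the unsigned 16-bit range onto the signed one, _mm_packs_epi32 saturates there, and the 16-bit add undoes the bias. One lane of the same arithmetic, as a sketch:

    #include <stdint.h>

    // Scalar model of the _mm_packs_epi32-based emulation above.
    static inline uint16_t pack_u32_to_u16(int32_t x) {
        int32_t biased = x - 0x8000;                 // map [0, 0xffff] onto [-0x8000, 0x7fff]
        if (biased >  0x7fff) { biased =  0x7fff; }  // signed saturation, like _mm_packs_epi32
        if (biased < -0x8000) { biased = -0x8000; }
        return (uint16_t)(biased + 0x8000);          // undo the bias, like the _mm_add_epi16
    }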

-template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) {
+template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
     auto _32 = _mm_cvttps_epi32(src.fVec);
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     const int _ = ~0;
     return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
 #else
     auto _16 = _mm_packus_epi16(_32, _32);
     return _mm_packus_epi16(_16, _16);
 #endif
 }

-template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) {
+template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     const int _ = ~0;
     auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
 #else
     auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),
         _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128());
 #endif
     return _mm_cvtepi32_ps(_32);
 }

-template<> inline Sk4f SkNx_cast<float, uint16_t, 4>(const Sk4h& src) {
+template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
     auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
     return _mm_cvtepi32_ps(_32);
 }

 static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
     _mm_storeu_si128((__m128i*)bytes,
                      _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                                        _mm_cvttps_epi32(b.fVec)),
                                       _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                                        _mm_cvttps_epi32(d.fVec))));
 }
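Sk4f_ToBytes funnels sixteen floats through two rounds of saturating packs into a single 16-byte store; the values here are chosen to stay in [0, 255] after truncation, with out-of-range inputs leaning on the saturating packs. A hypothetical call, purely illustrative:

    uint8_t rgba[16];
    Sk4f a(  1.0f,   2.0f,   3.0f,   4.0f),
         b(  5.0f,   6.0f,   7.0f,   8.0f),
         c(  9.0f,  10.0f,  11.0f,  12.0f),
         d(250.0f, 251.0f, 252.0f, 253.0f);
    Sk4f_ToBytes(rgba, a, b, c, d);  // rgba = {1,2,3,4, 5,6,7,8, 9,10,11,12, 250,251,252,253}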

-template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) {
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
     return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
 }

-template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) {
+template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
     return _mm_packus_epi16(src.fVec, src.fVec);
 }
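Note the signature change running through all the casts: the explicit lane count (the trailing ", 4") is gone, and the /*static*/ annotation presumably mirrors a static declaration of SkNx_cast in SkNx.h, with the lane count now deduced from the argument type. A hypothetical call site, assuming the Sk4f/Sk4h aliases used throughout this file:

    Sk4f floats(0.5f, 1.5f, 2.5f, 3.5f);
    Sk4h halves = SkNx_cast<uint16_t>(floats);  // lane count deduced from Sk4f; lanes truncate to 0,1,2,3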

-
-}  // namespace
-
 #endif//SkNx_sse_DEFINED