Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(305)

Side by Side Diff: src/opts/SkNx_sse.h

Issue 2197683002: SkNx: add Sk4u (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkNx_neon.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2015 Google Inc. 2 * Copyright 2015 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkNx_sse_DEFINED 8 #ifndef SkNx_sse_DEFINED
9 #define SkNx_sse_DEFINED 9 #define SkNx_sse_DEFINED
10 10
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
123 #else 123 #else
124 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), 124 return _mm_or_ps(_mm_and_ps (fVec, t.fVec),
125 _mm_andnot_ps(fVec, e.fVec)); 125 _mm_andnot_ps(fVec, e.fVec));
126 #endif 126 #endif
127 } 127 }
128 128
129 __m128 fVec; 129 __m128 fVec;
130 }; 130 };
131 131
132 template <> 132 template <>
133 class SkNx<4, int> { 133 class SkNx<4, int32_t> {
134 public: 134 public:
135 SkNx(const __m128i& vec) : fVec(vec) {} 135 SkNx(const __m128i& vec) : fVec(vec) {}
136 136
137 SkNx() {} 137 SkNx() {}
138 SkNx(int val) : fVec(_mm_set1_epi32(val)) {} 138 SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {}
139 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } 139 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
140 SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} 140 SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c ,d)) {}
141 141
142 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } 142 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
143 143
144 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } 144 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
145 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } 145 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
146 SkNx operator * (const SkNx& o) const { 146 SkNx operator * (const SkNx& o) const {
147 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), 147 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
148 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); 148 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
149 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)) , 149 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)) ,
150 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)) ); 150 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)) );
151 } 151 }
152 152
153 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } 153 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
154 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } 154 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
155 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } 155 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
156 156
157 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } 157 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
158 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } 158 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
159 159
160 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } 160 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
161 SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); } 161 SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
162 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); } 162 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }
163 163
164 int operator[](int k) const { 164 int32_t operator[](int k) const {
165 SkASSERT(0 <= k && k < 4); 165 SkASSERT(0 <= k && k < 4);
166 union { __m128i v; int is[4]; } pun = {fVec}; 166 union { __m128i v; int32_t is[4]; } pun = {fVec};
167 return pun.is[k&3]; 167 return pun.is[k&3];
168 } 168 }
169 169
170 SkNx thenElse(const SkNx& t, const SkNx& e) const { 170 SkNx thenElse(const SkNx& t, const SkNx& e) const {
171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
172 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); 172 return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
173 #else 173 #else
174 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), 174 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
175 _mm_andnot_si128(fVec, e.fVec)); 175 _mm_andnot_si128(fVec, e.fVec));
176 #endif 176 #endif
177 } 177 }
178 178
179 __m128i fVec; 179 __m128i fVec;
180 }; 180 };
181 181
182 template <> 182 template <>
183 class SkNx<4, uint32_t> {
184 public:
185 SkNx(const __m128i& vec) : fVec(vec) {}
186
187 SkNx() {}
188 SkNx(uint32_t val) : fVec(_mm_set1_epi32(val)) {}
189 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
190 SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) : fVec(_mm_setr_epi32(a ,b,c,d)) {}
191
192 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
193
194 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
195 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
196 // Not quite sure how to best do operator * in SSE2. We probably don't use it.
197
198 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
199 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
200 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
201
202 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
203 SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); }
204
205 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
206 // operator < and > take a little extra fiddling to make work for unsigned ints.
207
208 uint32_t operator[](int k) const {
209 SkASSERT(0 <= k && k < 4);
210 union { __m128i v; uint32_t us[4]; } pun = {fVec};
211 return pun.us[k&3];
212 }
213
214 SkNx thenElse(const SkNx& t, const SkNx& e) const {
215 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
216 return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
217 #else
218 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
219 _mm_andnot_si128(fVec, e.fVec));
220 #endif
221 }
222
223 __m128i fVec;
224 };
225
226
227 template <>
183 class SkNx<4, uint16_t> { 228 class SkNx<4, uint16_t> {
184 public: 229 public:
185 SkNx(const __m128i& vec) : fVec(vec) {} 230 SkNx(const __m128i& vec) : fVec(vec) {}
186 231
187 SkNx() {} 232 SkNx() {}
188 SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} 233 SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
189 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } 234 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
190 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a ,b,c,d,0,0,0,0)) {} 235 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a ,b,c,d,0,0,0,0)) {}
191 236
192 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } 237 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
308 } 353 }
309 354
310 SkNx thenElse(const SkNx& t, const SkNx& e) const { 355 SkNx thenElse(const SkNx& t, const SkNx& e) const {
311 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), 356 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
312 _mm_andnot_si128(fVec, e.fVec)); 357 _mm_andnot_si128(fVec, e.fVec));
313 } 358 }
314 359
315 __m128i fVec; 360 __m128i fVec;
316 }; 361 };
317 362
318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { 363 template<> /*static*/ inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
319 return _mm_cvtepi32_ps(src.fVec); 364 return _mm_cvtepi32_ps(src.fVec);
320 } 365 }
366 template<> /*static*/ inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
367 return SkNx_cast<float>(Sk4i::Load(&src));
368 }
321 369
322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { 370 template <> /*static*/ inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
323 return _mm_cvttps_epi32(src.fVec); 371 return _mm_cvttps_epi32(src.fVec);
324 } 372 }
325 373
326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { 374 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 375 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
328 // TODO: This seems to be causing code generation problems. Investigate? 376 // TODO: This seems to be causing code generation problems. Investigate?
329 return _mm_packus_epi32(src.fVec); 377 return _mm_packus_epi32(src.fVec);
330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 378 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. 379 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
332 const int _ = ~0; 380 const int _ = ~0;
333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_, _,_,_,_,_)); 381 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_, _,_,_,_,_));
334 #else 382 #else
335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want. 383 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want.
336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); 384 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
337 return _mm_packs_epi32(x,x); 385 return _mm_packs_epi32(x,x);
338 #endif 386 #endif
339 } 387 }
340 388
341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { 389 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src)); 390 return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src));
343 } 391 }
344 392
345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { 393 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
346 auto _32 = _mm_cvttps_epi32(src.fVec); 394 auto _32 = _mm_cvttps_epi32(src.fVec);
347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 395 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
348 const int _ = ~0; 396 const int _ = ~0;
349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_ ,_)); 397 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_ ,_));
350 #else 398 #else
351 auto _16 = _mm_packus_epi16(_32, _32); 399 auto _16 = _mm_packus_epi16(_32, _32);
352 return _mm_packus_epi16(_16, _16); 400 return _mm_packus_epi16(_16, _16);
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
384 } 432 }
385 433
386 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { 434 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
387 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); 435 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
388 } 436 }
389 437
390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { 438 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
391 return _mm_packus_epi16(src.fVec, src.fVec); 439 return _mm_packus_epi16(src.fVec, src.fVec);
392 } 440 }
393 441
394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { 442 template<> /*static*/ inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); 443 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
396 } 444 }
397 445
398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { 446 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); 447 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
400 } 448 }
401 449
402 static inline Sk4i Sk4f_round(const Sk4f& x) { 450 static inline Sk4i Sk4f_round(const Sk4f& x) {
403 return _mm_cvtps_epi32(x.fVec); 451 return _mm_cvtps_epi32(x.fVec);
404 } 452 }
405 453
406 static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) { 454 static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
407 __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0), 455 __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
408 hi = _mm_loadu_si128(((__m128i*)ptr) + 1); 456 hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
(...skipping 11 matching lines...) Expand all
420 const Sk4h& a) { 468 const Sk4h& a) {
421 __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); 469 __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
422 __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); 470 __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
423 __m128i lo = _mm_unpacklo_epi32(rg, ba); 471 __m128i lo = _mm_unpacklo_epi32(rg, ba);
424 __m128i hi = _mm_unpackhi_epi32(rg, ba); 472 __m128i hi = _mm_unpackhi_epi32(rg, ba);
425 _mm_storeu_si128(((__m128i*) dst) + 0, lo); 473 _mm_storeu_si128(((__m128i*) dst) + 0, lo);
426 _mm_storeu_si128(((__m128i*) dst) + 1, hi); 474 _mm_storeu_si128(((__m128i*) dst) + 1, hi);
427 } 475 }
428 476
429 #endif//SkNx_sse_DEFINED 477 #endif//SkNx_sse_DEFINED
OLDNEW
« no previous file with comments | « src/opts/SkNx_neon.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698