| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
| 10 | 10 |
| (...skipping 112 matching lines...) |
| 123 #else | 123 #else |
| 124 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), | 124 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), |
| 125 _mm_andnot_ps(fVec, e.fVec)); | 125 _mm_andnot_ps(fVec, e.fVec)); |
| 126 #endif | 126 #endif |
| 127 } | 127 } |
| 128 | 128 |
| 129 __m128 fVec; | 129 __m128 fVec; |
| 130 }; | 130 }; |
| 131 | 131 |
| 132 template <> | 132 template <> |
| 133 class SkNx<4, int> { | 133 class SkNx<4, int32_t> { |
| 134 public: | 134 public: |
| 135 SkNx(const __m128i& vec) : fVec(vec) {} | 135 SkNx(const __m128i& vec) : fVec(vec) {} |
| 136 | 136 |
| 137 SkNx() {} | 137 SkNx() {} |
| 138 SkNx(int val) : fVec(_mm_set1_epi32(val)) {} | 138 SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {} |
| 139 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } | 139 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } |
| 140 SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} | 140 SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {} |
| 141 | 141 |
| 142 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } | 142 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } |
| 143 | 143 |
| 144 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } | 144 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } |
| 145 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } | 145 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } |
| 146 SkNx operator * (const SkNx& o) const { | 146 SkNx operator * (const SkNx& o) const { |
| 147 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), | 147 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), |
| 148 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); | 148 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); |
| 149 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), | 149 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), |
| 150 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); | 150 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); |
| 151 } | 151 } |
| 152 | 152 |
| 153 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } | 153 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } |
| 154 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } | 154 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } |
| 155 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } | 155 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } |
| 156 | 156 |
| 157 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } | 157 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } |
| 158 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } | 158 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } |
| 159 | 159 |
| 160 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } | 160 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } |
| 161 SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); } | 161 SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); } |
| 162 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); } | 162 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); } |
| 163 | 163 |
| 164 int operator[](int k) const { | 164 int32_t operator[](int k) const { |
| 165 SkASSERT(0 <= k && k < 4); | 165 SkASSERT(0 <= k && k < 4); |
| 166 union { __m128i v; int is[4]; } pun = {fVec}; | 166 union { __m128i v; int32_t is[4]; } pun = {fVec}; |
| 167 return pun.is[k&3]; | 167 return pun.is[k&3]; |
| 168 } | 168 } |
| 169 | 169 |
| 170 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 170 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | 171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 172 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); | 172 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); |
| 173 #else | 173 #else |
| 174 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 174 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 175 _mm_andnot_si128(fVec, e.fVec)); | 175 _mm_andnot_si128(fVec, e.fVec)); |
| 176 #endif | 176 #endif |
| 177 } | 177 } |
| 178 | 178 |
| 179 __m128i fVec; | 179 __m128i fVec; |
| 180 }; | 180 }; |
| 181 | 181 |
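Aside: SSE2 has no packed 32-bit low multiply, which is why operator * above splits the work into two _mm_mul_epu32 calls (even lanes, then odd lanes shifted down) and shuffles the low 32 bits of each product back together. On SSE4.1 the whole body could collapse to a single intrinsic; a minimal sketch of that alternative, offered as an illustration only and not as part of this CL:

    // Sketch, SSE4.1 only: _mm_mullo_epi32 keeps the low 32 bits of each
    // 32x32 product, matching what the mul_epu32 + shuffle emulation computes.
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi32(fVec, o.fVec); }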
| 182 template <> | 182 template <> |
| 183 class SkNx<4, uint32_t> { |
| 184 public: |
| 185 SkNx(const __m128i& vec) : fVec(vec) {} |
| 186 |
| 187 SkNx() {} |
| 188 SkNx(uint32_t val) : fVec(_mm_set1_epi32(val)) {} |
| 189 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } |
| 190 SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {} |
| 191 |
| 192 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } |
| 193 |
| 194 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } |
| 195 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } |
| 196 // Not quite sure how to best do operator * in SSE2. We probably don't use it. |
| 197 |
| 198 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } |
| 199 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } |
| 200 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } |
| 201 |
| 202 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } |
| 203 SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); } |
| 204 |
| 205 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } |
| 206 // operator < and > take a little extra fiddling to make work for unsigned ints. |
| 207 |
| 208 uint32_t operator[](int k) const { |
| 209 SkASSERT(0 <= k && k < 4); |
| 210 union { __m128i v; uint32_t us[4]; } pun = {fVec}; |
| 211 return pun.us[k&3]; |
| 212 } |
| 213 |
| 214 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 215 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 216 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); |
| 217 #else |
| 218 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 219 _mm_andnot_si128(fVec, e.fVec)); |
| 220 #endif |
| 221 } |
| 222 |
| 223 __m128i fVec; |
| 224 }; |
| 225 |
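The two comments in the new Sk4u class above leave operator * and the ordered compares unimplemented. For the compares, the usual SSE2 "extra fiddling" is to bias both sides by 0x80000000 so the signed compare intrinsics yield the unsigned ordering; a minimal sketch with a hypothetical helper name (illustration only, not part of this CL):

    // Hypothetical helper: lane-wise unsigned a < b, returned as a full-lane mask.
    static inline Sk4u sk4u_lt(const Sk4u& a, const Sk4u& b) {
        const __m128i bias = _mm_set1_epi32((int)0x80000000);
        return _mm_cmplt_epi32(_mm_xor_si128(a.fVec, bias),
                               _mm_xor_si128(b.fVec, bias));
    }

For operator *, the low 32 bits of a product do not depend on signedness, so the Sk4i multiply above (or _mm_mullo_epi32 on SSE4.1) would double as an unsigned multiply if one is ever needed.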
| 226 |
| 227 template <> |
| 183 class SkNx<4, uint16_t> { | 228 class SkNx<4, uint16_t> { |
| 184 public: | 229 public: |
| 185 SkNx(const __m128i& vec) : fVec(vec) {} | 230 SkNx(const __m128i& vec) : fVec(vec) {} |
| 186 | 231 |
| 187 SkNx() {} | 232 SkNx() {} |
| 188 SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} | 233 SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} |
| 189 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } | 234 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } |
| 190 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {} | 235 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {} |
| 191 | 236 |
| 192 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } | 237 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } |
| (...skipping 115 matching lines...) |
| 308 } | 353 } |
| 309 | 354 |
| 310 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 355 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 311 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 356 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 312 _mm_andnot_si128(fVec, e.fVec)); | 357 _mm_andnot_si128(fVec, e.fVec)); |
| 313 } | 358 } |
| 314 | 359 |
| 315 __m128i fVec; | 360 __m128i fVec; |
| 316 }; | 361 }; |
| 317 | 362 |
| 318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { | 363 template<> /*static*/ inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { |
| 319 return _mm_cvtepi32_ps(src.fVec); | 364 return _mm_cvtepi32_ps(src.fVec); |
| 320 } | 365 } |
| 366 template<> /*static*/ inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { |
| 367 return SkNx_cast<float>(Sk4i::Load(&src)); |
| 368 } |
| 321 | 369 |
| 322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { | 370 template <> /*static*/ inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { |
| 323 return _mm_cvttps_epi32(src.fVec); | 371 return _mm_cvttps_epi32(src.fVec); |
| 324 } | 372 } |
| 325 | 373 |
| 326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { | 374 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) { |
| 327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | 375 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 328 // TODO: This seems to be causing code generation problems. Investigate? | 376 // TODO: This seems to be causing code generation problems. Investigate? |
| 329 return _mm_packus_epi32(src.fVec, src.fVec); | 377 return _mm_packus_epi32(src.fVec, src.fVec); |
| 330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 378 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. | 379 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. |
| 332 const int _ = ~0; | 380 const int _ = ~0; |
| 333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); | 381 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); |
| 334 #else | 382 #else |
| 335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want. | 383 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want. |
| 336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); | 384 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); |
| 337 return _mm_packs_epi32(x,x); | 385 return _mm_packs_epi32(x,x); |
| 338 #endif | 386 #endif |
| 339 } | 387 } |
| 340 | 388 |
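For reference, the SSE2 fallback above works because the slli/srai pair sign-extends the low 16 bits of each lane, so _mm_packs_epi32's signed saturation reproduces exactly those 16 bits. A hypothetical spot check, assuming the usual Sk4h operator[] accessor (illustration only, not part of this CL):

    // Values that fit in 16 bits should round-trip through the cast unchanged.
    Sk4i v(1, 0x1234, 0xFFFF, 42);
    Sk4h h = SkNx_cast<uint16_t>(v);
    SkASSERT(h[0] == 1 && h[1] == 0x1234 && h[2] == 0xFFFF && h[3] == 42);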
| 341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { | 389 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
| 342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src)); | 390 return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src)); |
| 343 } | 391 } |
| 344 | 392 |
| 345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { | 393 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
| 346 auto _32 = _mm_cvttps_epi32(src.fVec); | 394 auto _32 = _mm_cvttps_epi32(src.fVec); |
| 347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 395 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 348 const int _ = ~0; | 396 const int _ = ~0; |
| 349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); | 397 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); |
| 350 #else | 398 #else |
| 351 auto _16 = _mm_packus_epi16(_32, _32); | 399 auto _16 = _mm_packus_epi16(_32, _32); |
| 352 return _mm_packus_epi16(_16, _16); | 400 return _mm_packus_epi16(_16, _16); |
| (...skipping 31 matching lines...) |
| 384 } | 432 } |
| 385 | 433 |
| 386 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { | 434 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { |
| 387 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); | 435 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); |
| 388 } | 436 } |
| 389 | 437 |
| 390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { | 438 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
| 391 return _mm_packus_epi16(src.fVec, src.fVec); | 439 return _mm_packus_epi16(src.fVec, src.fVec); |
| 392 } | 440 } |
| 393 | 441 |
| 394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { | 442 template<> /*static*/ inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) { |
| 395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); | 443 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); |
| 396 } | 444 } |
| 397 | 445 |
| 398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { | 446 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) { |
| 399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); | 447 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); |
| 400 } | 448 } |
| 401 | 449 |
| 402 static inline Sk4i Sk4f_round(const Sk4f& x) { | 450 static inline Sk4i Sk4f_round(const Sk4f& x) { |
| 403 return _mm_cvtps_epi32(x.fVec); | 451 return _mm_cvtps_epi32(x.fVec); |
| 404 } | 452 } |
| 405 | 453 |
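Note that Sk4f_round uses _mm_cvtps_epi32, which follows the current MXCSR rounding mode (round-to-nearest-even by default), whereas SkNx_cast<int32_t, float> above uses the truncating _mm_cvttps_epi32. A small sketch of the difference, assuming the default rounding mode (illustration only):

    Sk4f x(0.5f, 1.5f, 2.7f, -1.5f);
    Sk4i r = Sk4f_round(x);          // 0, 2, 3, -2  (nearest, ties to even)
    Sk4i t = SkNx_cast<int32_t>(x);  // 0, 1, 2, -1  (truncated toward zero)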
| 406 static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) { | 454 static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) { |
| 407 __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0), | 455 __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0), |
| 408 hi = _mm_loadu_si128(((__m128i*)ptr) + 1); | 456 hi = _mm_loadu_si128(((__m128i*)ptr) + 1); |
| (...skipping 11 matching lines...) |
| 420 const Sk4h& a) { | 468 const Sk4h& a) { |
| 421 __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); | 469 __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); |
| 422 __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); | 470 __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); |
| 423 __m128i lo = _mm_unpacklo_epi32(rg, ba); | 471 __m128i lo = _mm_unpacklo_epi32(rg, ba); |
| 424 __m128i hi = _mm_unpackhi_epi32(rg, ba); | 472 __m128i hi = _mm_unpackhi_epi32(rg, ba); |
| 425 _mm_storeu_si128(((__m128i*) dst) + 0, lo); | 473 _mm_storeu_si128(((__m128i*) dst) + 0, lo); |
| 426 _mm_storeu_si128(((__m128i*) dst) + 1, hi); | 474 _mm_storeu_si128(((__m128i*) dst) + 1, hi); |
| 427 } | 475 } |
| 428 | 476 |
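Sk4h_store4 is the inverse of Sk4h_load4: the unpacklo_epi16 steps interleave the channels and the unpack*_epi32 steps lay the four pixels out contiguously. A small usage sketch with made-up values (illustration only):

    uint16_t px[16];
    Sk4h r(1,2,3,4), g(5,6,7,8), b(9,10,11,12), a(13,14,15,16);
    Sk4h_store4(px, r, g, b, a);
    // px == {1,5,9,13,  2,6,10,14,  3,7,11,15,  4,8,12,16},
    // i.e. channel k of pixel i lands at px[4*i + k].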
| 429 #endif//SkNx_sse_DEFINED | 477 #endif//SkNx_sse_DEFINED |
| OLD | NEW |