| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
| 10 | 10 |
| (...skipping 305 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 316 }; | 316 }; |
| 317 | 317 |
| 318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { | 318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { |
| 319 return _mm_cvtepi32_ps(src.fVec); | 319 return _mm_cvtepi32_ps(src.fVec); |
| 320 } | 320 } |
| 321 | 321 |
| 322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { | 322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { |
| 323 return _mm_cvttps_epi32(src.fVec); | 323 return _mm_cvttps_epi32(src.fVec); |
| 324 } | 324 } |
| 325 | 325 |
| 326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { | 326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { |
| 327 auto _32 = _mm_cvttps_epi32(src.fVec); | 327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 328 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. | 328 // TODO: This seems to be causing code generation problems. Investigate? |
| 329 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 329 return _mm_packus_epi32(src.fVec); |
| 330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 330 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. | 331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. |
| 331 const int _ = ~0; | 332 const int _ = ~0; |
| 332 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); | 333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); |
| 333 #else | 334 #else |
| 334 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: | 335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want. |
| 335 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); | 336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); |
| 336 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000)); | 337 return _mm_packs_epi32(x,x); |
| 337 #endif | 338 #endif |
| 338 } | 339 } |
| 339 | 340 |
| 341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
| 342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src)); |
| 343 } |
| 344 |
| 340 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { | 345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
| 341 auto _32 = _mm_cvttps_epi32(src.fVec); | 346 auto _32 = _mm_cvttps_epi32(src.fVec); |
| 342 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 343 const int _ = ~0; | 348 const int _ = ~0; |
| 344 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); | 349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); |
| 345 #else | 350 #else |
| 346 auto _16 = _mm_packus_epi16(_32, _32); | 351 auto _16 = _mm_packus_epi16(_32, _32); |
| 347 return _mm_packus_epi16(_16, _16); | 352 return _mm_packus_epi16(_16, _16); |
| 348 #endif | 353 #endif |
| 349 } | 354 } |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 383 } | 388 } |
| 384 | 389 |
| 385 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { | 390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
| 386 return _mm_packus_epi16(src.fVec, src.fVec); | 391 return _mm_packus_epi16(src.fVec, src.fVec); |
| 387 } | 392 } |
| 388 | 393 |
| 389 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { | 394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { |
| 390 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); | 395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); |
| 391 } | 396 } |
| 392 | 397 |
| 393 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { | |
| 394 // TODO: merge with other work exploring best int -> uint16_t conversion. | |
| 395 | |
| 396 // Sign extend to trick _mm_packs_epi32() into doing the pack we want. | |
| 397 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); | |
| 398 return _mm_packs_epi32(x,x); | |
| 399 } | |
| 400 | |
| 401 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { | 398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { |
| 402 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); | 399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); |
| 403 } | 400 } |
| 404 | 401 |
| 405 static inline Sk4i Sk4f_round(const Sk4f& x) { | 402 static inline Sk4i Sk4f_round(const Sk4f& x) { |
| 406 return _mm_cvtps_epi32(x.fVec); | 403 return _mm_cvtps_epi32(x.fVec); |
| 407 } | 404 } |
| 408 | 405 |
| 409 #endif//SkNx_sse_DEFINED | 406 #endif//SkNx_sse_DEFINED |
| OLD | NEW |