OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
10 | 10 |
(...skipping 305 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
316 }; | 316 }; |
317 | 317 |
// Sk4i -> Sk4f cast (unchanged between the OLD and NEW columns).
// Converts every 32-bit integer lane of src to float with a single _mm_cvtepi32_ps.
318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { | 318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { |
319 return _mm_cvtepi32_ps(src.fVec); | 319 return _mm_cvtepi32_ps(src.fVec); |
320 } | 320 } |
321 | 321 |
// Sk4f -> Sk4i cast (unchanged between the OLD and NEW columns).
// Uses _mm_cvttps_epi32, the truncating (round-toward-zero) float -> int32 conversion,
// rather than the rounding form _mm_cvtps_epi32.
322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { | 322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { |
323 return _mm_cvttps_epi32(src.fVec); | 323 return _mm_cvttps_epi32(src.fVec); |
324 } | 324 } |
325 | 325 |
// OLD column: SkNx_cast<uint16_t, float>. Truncates the floats to 32-bit ints with
// _mm_cvttps_epi32 and then narrows those ints to 16 bits inside this same function.
// NEW column: the specialization becomes SkNx_cast<uint16_t, int>; it narrows the 32-bit
// lanes of src directly and no longer performs the float conversion itself.
//
// Narrowing strategy, selected by SK_CPU_SSE_LEVEL:
//   - SSE4.1 (NEW only, compiled out by the leading `0 &&`): _mm_packus_epi32.
//     NOTE(review): the call passes a single argument, while _mm_packus_epi32 is documented
//     as taking two __m128i operands (compare _mm_packs_epi32(x,x) below) -- fix the call
//     before re-enabling this branch.
//   - SSSE3 (both columns): _mm_shuffle_epi8 copies bytes 0,1 / 4,5 / 8,9 / 12,13, i.e. the
//     low 16 bits of each 32-bit lane, into the low 8 bytes; the ~0 mask entries zero the
//     upper 8 bytes of the result.
//   - SSE2, OLD: subtract 0x8000, pack with signed saturation, then add 0x8000 back to each
//     16-bit lane, emulating the unsigned-saturating _mm_packus_epi32.
//   - SSE2, NEW: shift left then arithmetic-shift right by 16 to sign extend the low 16 bits
//     of every lane, so the signed saturation in _mm_packs_epi32 never triggers and those
//     16 bits pass through unchanged.
//
// NOTE(review): for lanes outside [0, 65535] the OLD SSE2 path saturates, whereas the SSSE3
// shuffle and the NEW SSE2 path keep only the low 16 bits. Presumably callers only pass
// in-range values -- confirm.
326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { | 326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { |
327 auto _32 = _mm_cvttps_epi32(src.fVec); | 327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
328 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. | 328 // TODO: This seems to be causing code generation problems. Investigate? |
329 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 329 return _mm_packus_epi32(src.fVec); |
| 330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
330 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into
place. | 331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into
place. |
331 const int _ = ~0; | 332 const int _ = ~0; |
332 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_
,_,_)); | 333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,
_,_,_,_,_)); |
333 #else | 334 #else |
334 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: | 335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do th
e pack we want. |
335 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); | 336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); |
336 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000
)); | 337 return _mm_packs_epi32(x,x); |
337 #endif | 338 #endif |
338 } | 339 } |
339 | 340 |
// NEW column only: Sk4f -> Sk4h is now a composition of two other casts. The floats are
// first converted with SkNx_cast<int>, and the resulting ints are narrowed with
// SkNx_cast<uint16_t>, so this function contains no intrinsics of its own.
| 341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
| 342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src)); |
| 343 } |
| 344 |
// Sk4f -> Sk4b cast (unchanged between the OLD and NEW columns).
// The floats are truncated to 32-bit ints with _mm_cvttps_epi32, then narrowed to bytes:
//   - SSSE3: _mm_shuffle_epi8 moves byte 0 of each 32-bit lane (source bytes 0,4,8,12) into
//     the low 4 bytes; the ~0 mask entries zero the remaining 12 bytes.
//   - otherwise: two successive _mm_packus_epi16 packs (unsigned saturation, 16 -> 8 bit).
//     NOTE(review): _mm_packus_epi16 reads its input as 16-bit lanes, so this path
//     presumably relies on every converted value already lying in [0, 255] -- confirm.
340 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { | 345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
341 auto _32 = _mm_cvttps_epi32(src.fVec); | 346 auto _32 = _mm_cvttps_epi32(src.fVec); |
342 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
343 const int _ = ~0; | 348 const int _ = ~0; |
344 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); | 349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); |
345 #else | 350 #else |
346 auto _16 = _mm_packus_epi16(_32, _32); | 351 auto _16 = _mm_packus_epi16(_32, _32); |
347 return _mm_packus_epi16(_16, _16); | 352 return _mm_packus_epi16(_16, _16); |
348 #endif | 353 #endif |
349 } | 354 } |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
383 } | 388 } |
384 | 389 |
// Sk4h -> Sk4b cast (unchanged between the OLD and NEW columns).
// Narrows 16-bit lanes to bytes with _mm_packus_epi16, which saturates to the unsigned
// 8-bit range. The same vector is passed as both operands, so the packed bytes appear in
// both halves of the result.
385 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src)
{ | 390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src)
{ |
386 return _mm_packus_epi16(src.fVec, src.fVec); | 391 return _mm_packus_epi16(src.fVec, src.fVec); |
387 } | 392 } |
388 | 393 |
// Sk4h -> Sk4i cast (unchanged between the OLD and NEW columns).
// Interleaves the low four 16-bit lanes of src with zeros via _mm_unpacklo_epi16, which
// zero-extends each of them to a 32-bit lane.
389 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { | 394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { |
390 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); | 395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); |
391 } | 396 } |
392 | 397 |
// OLD column only: the right-hand column is empty for these rows, i.e. this Sk4i -> Sk4h
// specialization is deleted at this position (the NEW column defines
// SkNx_cast<uint16_t, int> earlier in the file instead).
// It sign extends the low 16 bits of each 32-bit lane (shift left 16, arithmetic shift
// right 16) so that the signed-saturating _mm_packs_epi32 passes those bits through.
393 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { | |
394 // TODO: merge with other work exploring best int -> uint16_t conversion. | |
395 | |
396 // Sign extend to trick _mm_packs_epi32() into doing the pack we want. | |
397 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); | |
398 return _mm_packs_epi32(x,x); | |
399 } | |
400 | |
// Sk4i -> Sk4b cast (same code in both columns; only the viewer line numbers differ).
// Applies _mm_packus_epi16 twice: the inner call packs src with itself, and the outer call
// packs that result with src as its second operand.
// NOTE(review): _mm_packus_epi16 reads its inputs as 16-bit lanes, so this presumably
// relies on every 32-bit lane already lying in [0, 255] -- confirm against callers.
401 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { | 398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { |
402 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); | 399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); |
403 } | 400 } |
404 | 401 |
// Rounds each float lane of x to a 32-bit int (same code in both columns).
// Uses _mm_cvtps_epi32, which rounds according to the current SSE rounding mode
// (round-to-nearest-even unless MXCSR has been changed), unlike the truncating
// _mm_cvttps_epi32.
405 static inline Sk4i Sk4f_round(const Sk4f& x) { | 402 static inline Sk4i Sk4f_round(const Sk4f& x) { |
406 return _mm_cvtps_epi32(x.fVec); | 403 return _mm_cvtps_epi32(x.fVec); |
407 } | 404 } |
408 | 405 |
409 #endif//SkNx_sse_DEFINED | 406 #endif//SkNx_sse_DEFINED |
OLD | NEW |