OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
10 | 10 |
(...skipping 332 matching lines...)
343 __m128i fVec; | 343 __m128i fVec; |
344 }; | 344 }; |
345 | 345 |
346 | 346 |
// SkNx_cast<int, float, 4>: converts the four float lanes of src to four 32-bit integer
// lanes using _mm_cvttps_epi32, the truncating float->int32 conversion.
// This definition is identical in the OLD and NEW columns.
347 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { | 347 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { |
348 return _mm_cvttps_epi32(src.fVec); | 348 return _mm_cvttps_epi32(src.fVec); |
349 } | 349 } |
350 | 350 |
// SkNx_cast<uint16_t, float, 4>: converts the four float lanes of src to four 16-bit lanes.
// This is the only definition in view that differs between the OLD and NEW columns:
//   OLD: returned _mm_packus_epi16(_32, _32). That intrinsic narrows 16-bit lanes to 8-bit
//        lanes, so it does not perform the 32-bit -> 16-bit narrowing needed here.
//   NEW: narrows each 32-bit lane to 16 bits, with a byte shuffle when SSSE3 is available
//        and a bias / signed-saturating pack / un-bias sequence otherwise.
// Both versions first truncate the floats to 32-bit ints with _mm_cvttps_epi32.
351 template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) { | 351 template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) { |
352 auto _32 = _mm_cvttps_epi32(src.fVec); | 352 auto _32 = _mm_cvttps_epi32(src.fVec); |
353 return _mm_packus_epi16(_32, _32); | 353 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. |
| 354 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 355 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into
place. |
// Shuffle control: source bytes 0,1 / 4,5 / 8,9 / 12,13 are the low 16 bits of each 32-bit
// lane. The remaining eight control bytes are ~0; a control byte with its high bit set makes
// _mm_shuffle_epi8 write zero to that output byte, so the upper 8 bytes of the result are 0.
| 356 const int _ = ~0; |
| 357 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_
,_,_)); |
| 358 #else |
| 359 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: |
// Subtracting 0x8000 moves [0, 65535] into the signed 16-bit range so the signed-saturating
// _mm_packs_epi32 can narrow it; adding 0x8000 back per 16-bit lane (with wraparound)
// restores the unsigned value.
// NOTE(review): for inputs outside [0, 65535] this path saturates, whereas the SSSE3 shuffle
// above keeps only the low 16 bits -- confirm callers only pass in-range values.
| 360 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); |
| 361 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000
)); |
| 362 #endif |
354 } | 363 } |
355 | 364 |
356 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { | 365 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { |
357 auto _32 = _mm_cvttps_epi32(src.fVec); | 366 auto _32 = _mm_cvttps_epi32(src.fVec); |
358 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 367 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
359 const int _ = ~0; | 368 const int _ = ~0; |
360 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); | 369 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); |
361 #else | 370 #else |
362 auto _16 = _mm_packus_epi16(_32, _32); | 371 auto _16 = _mm_packus_epi16(_32, _32); |
363 return _mm_packus_epi16(_16, _16); | 372 return _mm_packus_epi16(_16, _16); |
(...skipping 30 matching lines...)
394 } | 403 } |
395 | 404 |
// SkNx_cast<uint8_t, uint16_t, 4>: narrows 16-bit lanes to 8-bit lanes with
// _mm_packus_epi16, passing src.fVec as both operands.
// _mm_packus_epi16 reads its inputs as signed 16-bit values and saturates to [0, 255].
// NOTE(review): lanes >= 0x8000 would therefore be treated as negative and pack to 0 --
// confirm callers keep values in range.
// This definition is identical in the OLD and NEW columns.
396 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { | 405 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { |
397 return _mm_packus_epi16(src.fVec, src.fVec); | 406 return _mm_packus_epi16(src.fVec, src.fVec); |
398 } | 407 } |
399 | 408 |
400 | 409 |
401 } // namespace | 410 } // namespace |
402 | 411 |
403 #endif//SkNx_sse_DEFINED | 412 #endif//SkNx_sse_DEFINED |
OLD | NEW |