Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(381)

Side by Side Diff: src/opts/SkNx_sse.h

Issue 2150343002: Add a bench to measure the best way to pack from int to uint16_t with SSE. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: so tired of this MSVC... Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « bench/pack_int_uint16_t_Bench.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2015 Google Inc. 2 * Copyright 2015 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkNx_sse_DEFINED 8 #ifndef SkNx_sse_DEFINED
9 #define SkNx_sse_DEFINED 9 #define SkNx_sse_DEFINED
10 10
(...skipping 305 matching lines...) Expand 10 before | Expand all | Expand 10 after
316 }; 316 };
317 317
318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { 318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
319 return _mm_cvtepi32_ps(src.fVec); 319 return _mm_cvtepi32_ps(src.fVec);
320 } 320 }
321 321
322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { 322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
323 return _mm_cvttps_epi32(src.fVec); 323 return _mm_cvttps_epi32(src.fVec);
324 } 324 }
325 325
326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { 326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
327 auto _32 = _mm_cvttps_epi32(src.fVec); 327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
328 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. 328 // TODO: This seems to be causing code generation problems. Investigate?
329 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 329 return _mm_packus_epi32(src.fVec);
330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
330 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. 331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
331 const int _ = ~0; 332 const int _ = ~0;
332 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_ ,_,_)); 333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_, _,_,_,_,_));
333 #else 334 #else
334 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: 335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want.
335 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); 336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
336 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000 )); 337 return _mm_packs_epi32(x,x);
337 #endif 338 #endif
338 } 339 }
339 340
341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src));
343 }
344
340 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { 345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
341 auto _32 = _mm_cvttps_epi32(src.fVec); 346 auto _32 = _mm_cvttps_epi32(src.fVec);
342 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
343 const int _ = ~0; 348 const int _ = ~0;
344 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_ ,_)); 349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_ ,_));
345 #else 350 #else
346 auto _16 = _mm_packus_epi16(_32, _32); 351 auto _16 = _mm_packus_epi16(_32, _32);
347 return _mm_packus_epi16(_16, _16); 352 return _mm_packus_epi16(_16, _16);
348 #endif 353 #endif
349 } 354 }
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
383 } 388 }
384 389
385 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { 390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
386 return _mm_packus_epi16(src.fVec, src.fVec); 391 return _mm_packus_epi16(src.fVec, src.fVec);
387 } 392 }
388 393
389 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { 394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
390 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); 395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
391 } 396 }
392 397
393 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
394 // TODO: merge with other work exploring best int -> uint16_t conversion.
395
396 // Sign extend to trick _mm_packs_epi32() into doing the pack we want.
397 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
398 return _mm_packs_epi32(x,x);
399 }
400
401 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { 398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
402 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); 399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
403 } 400 }
404 401
405 static inline Sk4i Sk4f_round(const Sk4f& x) { 402 static inline Sk4i Sk4f_round(const Sk4f& x) {
406 return _mm_cvtps_epi32(x.fVec); 403 return _mm_cvtps_epi32(x.fVec);
407 } 404 }
408 405
409 #endif//SkNx_sse_DEFINED 406 #endif//SkNx_sse_DEFINED
OLDNEW
« no previous file with comments | « bench/pack_int_uint16_t_Bench.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698