src/opts/SkNx_sse.h - Issue 2150343002: Add a bench to measure the best way to pack from int to uint16_t with SSE.

Side by Side Diff: src/opts/SkNx_sse.h

Issue 2150343002: Add a bench to measure the best way to pack from int to uint16_t with SSE. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: so tired of this MSVC... Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2015 Google Inc.	2 * Copyright 2015 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkNx_sse_DEFINED	8 #ifndef SkNx_sse_DEFINED

9 #define SkNx_sse_DEFINED	9 #define SkNx_sse_DEFINED

10	10

(...skipping 305 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
316 };	316 };

317	317

318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {	318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {

319 return _mm_cvtepi32_ps(src.fVec);	319 return _mm_cvtepi32_ps(src.fVec);

320 }	320 }

321	321

322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {	322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {

323 return _mm_cvttps_epi32(src.fVec);	323 return _mm_cvttps_epi32(src.fVec);

324 }	324 }

325	325

326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {	326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {

327 auto _32 = _mm_cvttps_epi32(src.fVec);	327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

328 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+.	328 // TODO: This seems to be causing code generation problems. Investigate?

329 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	329 return _mm_packus_epi32(src.fVec);

	330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

330 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.	331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.

331 const int _ = ~0;	332 const int _ = ~0;

332 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));	333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));

333 #else	334 #else

334 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:	335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want.

335 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));	336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);

336 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));	337 return _mm_packs_epi32(x,x);

337 #endif	338 #endif

338 }	339 }

339	340

	341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {

	342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src));

	343 }

	344

340 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {	345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {

341 auto _32 = _mm_cvttps_epi32(src.fVec);	346 auto _32 = _mm_cvttps_epi32(src.fVec);

342 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

343 const int _ = ~0;	348 const int _ = ~0;

344 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));	349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));

345 #else	350 #else

346 auto _16 = _mm_packus_epi16(_32, _32);	351 auto _16 = _mm_packus_epi16(_32, _32);

347 return _mm_packus_epi16(_16, _16);	352 return _mm_packus_epi16(_16, _16);

348 #endif	353 #endif

349 }	354 }

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
383 }	388 }

384	389

385 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {	390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {

386 return _mm_packus_epi16(src.fVec, src.fVec);	391 return _mm_packus_epi16(src.fVec, src.fVec);

387 }	392 }

388	393

389 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {	394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {

390 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());	395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());

391 }	396 }

392	397

393 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {

394 // TODO: merge with other work exploring best int -> uint16_t conversion.

395

396 // Sign extend to trick _mm_packs_epi32() into doing the pack we want.

397 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);

398 return _mm_packs_epi32(x,x);

399 }

400

401 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {	398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {

402 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);	399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);

403 }	400 }

404	401

405 static inline Sk4i Sk4f_round(const Sk4f& x) {	402 static inline Sk4i Sk4f_round(const Sk4f& x) {

406 return _mm_cvtps_epi32(x.fVec);	403 return _mm_cvtps_epi32(x.fVec);

407 }	404 }

408	405

409 #endif//SkNx_sse_DEFINED	406 #endif//SkNx_sse_DEFINED

OLD	NEW

« no previous file with comments | « bench/pack_int_uint16_t_Bench.cpp ('k') | no next file » | no next file with comments »