src/opts/SkNx_sse.h - Issue 1690633003: SkNx refactoring

Side by Side Diff: src/opts/SkNx_sse.h

Issue 1690633003: SkNx refactoring (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: fix comments Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2015 Google Inc.	2 * Copyright 2015 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkNx_sse_DEFINED	8 #ifndef SkNx_sse_DEFINED

9 #define SkNx_sse_DEFINED	9 #define SkNx_sse_DEFINED

10	10

(...skipping 277 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
288	288

289 SkNx thenElse(const SkNx& t, const SkNx& e) const {	289 SkNx thenElse(const SkNx& t, const SkNx& e) const {

290 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),	290 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),

291 _mm_andnot_si128(fVec, e.fVec));	291 _mm_andnot_si128(fVec, e.fVec));

292 }	292 }

293	293

294 __m128i fVec;	294 __m128i fVec;

295 };	295 };

296	296

297	297

298 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {	298 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {

299 auto _32 = _mm_cvttps_epi32(src.fVec);	299 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());

	300 }

	301 template<> /*static*/ inline Sk4i SkNx_cast< int, uint8_t>(const Sk4b& src) {

	302 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	303 const int _ = ~0;

	304 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3 ,_,_,_));

	305 #else

	306 return _mm_unpacklo_epi16(SkNx_cast<uint16_t>(src).fVec, _mm_setzero_si128() );

	307 #endif

	308 }

	309

	310 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {

	311 return _mm_packus_epi16(src.fVec, src.fVec);

	312 }

	313 template<> /*static*/ inline Sk4i SkNx_cast< int, uint16_t>(const Sk4h& src) {

	314 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());

	315 }

	316

	317 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {

	318 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	319 const int _ = ~0;

	320 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));

	321 #else

	322 // We're on our way to 8-bit anyway, so we don't care that _mm_packs_epi32 clamps to int16_t.

	323 Sk4h _16 = _mm_packs_epi32(src.fVec, src.fVec);

	324 return SkNx_cast<uint8_t>(_16);

	325 #endif

	326 }

	327 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {

	328 auto _32 = src.fVec;

300 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+.	329 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+.

301 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	330 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

302 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.	331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.

303 const int _ = ~0;	332 const int _ = ~0;

304 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_ ,_,_));	333 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_ ,_,_));

305 #else	334 #else

306 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:	335 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:

307 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));	336 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));

308 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000 ));	337 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000 ));

309 #endif	338 #endif

310 }	339 }

311	340

312 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {	341 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {

313 auto _32 = _mm_cvttps_epi32(src.fVec);	342 return _mm_cvtepi32_ps(src.fVec);

314 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	343 }

315 const int _ = ~0;	344 template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {

316 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_ ,_));	345 return SkNx_cast<float>(SkNx_cast<int>(src));

317 #else	346 }

318 auto _16 = _mm_packus_epi16(_32, _32);	347 template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {

319 return _mm_packus_epi16(_16, _16);	348 return SkNx_cast<float>(SkNx_cast<int>(src));

320 #endif

321 }	349 }

322	350

323 template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {	351 template<> /*static*/ inline Sk4i SkNx_cast< int, float>(const Sk4f& src) {

324 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	352 return _mm_cvttps_epi32(src.fVec);

325 const int _ = ~0;

326 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_, _, 3,_,_,_));

327 #else

328 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),

329 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128());

330 #endif

331 return _mm_cvtepi32_ps(_32);

332 }	353 }

333	354 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {

334 template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {	355 return SkNx_cast<uint16_t>(SkNx_cast<int>(src));

335 auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());	356 }

336 return _mm_cvtepi32_ps(_32);	357 template<> /*static*/ inline Sk4b SkNx_cast< uint8_t, float>(const Sk4f& src) {

	358 return SkNx_cast<uint8_t>(SkNx_cast<int>(src));

337 }	359 }

338	360

339 static inline void Sk4f_ToBytes(uint8_t bytes[16],	361 static inline void Sk4f_ToBytes(uint8_t bytes[16],

340 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {	362 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {

	363 // We're on our way to 8-bit anyway, so we don't care that _mm_packs_epi32 clamps to int16_t.

341 _mm_storeu_si128((__m128i*)bytes,	364 _mm_storeu_si128((__m128i*)bytes,

342 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),	365 _mm_packus_epi16(_mm_packs_epi32(_mm_cvttps_epi32(a.fVec),

343 _mm_cvttps_epi32(b.fVec)) ,	366 _mm_cvttps_epi32(b.fVec)),

344 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),	367 _mm_packs_epi32(_mm_cvttps_epi32(c.fVec),

345 _mm_cvttps_epi32(d.fVec)) ));	368 _mm_cvttps_epi32(d.fVec))) );

346 }

347

348 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {

349 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());

350 }

351

352 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {

353 return _mm_packus_epi16(src.fVec, src.fVec);

354 }	369 }

355	370

356 #endif//SkNx_sse_DEFINED	371 #endif//SkNx_sse_DEFINED

OLD	NEW

« no previous file with comments | « src/core/SkNx.h ('k') | no next file » | no next file with comments »