| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
| 10 | 10 |
| (...skipping 112 matching lines...) |
| 123 #else | 123 #else |
| 124 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), | 124 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), |
| 125 _mm_andnot_ps(fVec, e.fVec)); | 125 _mm_andnot_ps(fVec, e.fVec)); |
| 126 #endif | 126 #endif |
| 127 } | 127 } |
| 128 | 128 |
| 129 __m128 fVec; | 129 __m128 fVec; |
| 130 }; | 130 }; |
| 131 | 131 |
| 132 template <> | 132 template <> |
| 133 class SkNx<4, int> { | 133 class SkNx<4, int32_t> { |
| 134 public: | 134 public: |
| 135 SkNx(const __m128i& vec) : fVec(vec) {} | 135 SkNx(const __m128i& vec) : fVec(vec) {} |
| 136 | 136 |
| 137 SkNx() {} | 137 SkNx() {} |
| 138 SkNx(int val) : fVec(_mm_set1_epi32(val)) {} | 138 SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {} |
| 139 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } | 139 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } |
| 140 SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} | 140 SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {} |
| 141 | 141 |
| 142 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } | 142 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } |
| 143 | 143 |
| 144 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } | 144 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } |
| 145 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } | 145 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } |
| 146 SkNx operator * (const SkNx& o) const { | 146 SkNx operator * (const SkNx& o) const { |
| 147 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), | 147 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), |
| 148 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); | 148 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); |
| 149 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), | 149 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), |
| 150 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); | 150 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); |
| 151 } | 151 } |
| 152 | 152 |
| 153 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } | 153 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } |
| 154 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } | 154 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } |
| 155 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } | 155 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } |
| 156 | 156 |
| 157 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } | 157 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } |
| 158 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } | 158 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } |
| 159 | 159 |
| 160 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } | 160 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } |
| 161 SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); } | 161 SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); } |
| 162 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); } | 162 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); } |
| 163 | 163 |
| 164 int operator[](int k) const { | 164 int32_t operator[](int k) const { |
| 165 SkASSERT(0 <= k && k < 4); | 165 SkASSERT(0 <= k && k < 4); |
| 166 union { __m128i v; int is[4]; } pun = {fVec}; | 166 union { __m128i v; int32_t is[4]; } pun = {fVec}; |
| 167 return pun.is[k&3]; | 167 return pun.is[k&3]; |
| 168 } | 168 } |
| 169 | 169 |
| 170 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 170 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | 171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 172 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); | 172 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); |
| 173 #else | 173 #else |
| 174 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 174 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 175 _mm_andnot_si128(fVec, e.fVec)); | 175 _mm_andnot_si128(fVec, e.fVec)); |
| 176 #endif | 176 #endif |
| 177 } | 177 } |
| 178 | 178 |
| 179 __m128i fVec; | 179 __m128i fVec; |
| 180 }; | 180 }; |
| 181 | 181 |
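Aside: SSE2 has no packed 32-bit low multiply, which is why operator * above splits the work into two _mm_mul_epu32 calls (even lanes, then odd lanes shifted down) and shuffles the low 32 bits of each product back together. On SSE4.1 the whole body could collapse to a single intrinsic; a minimal sketch of that alternative, offered as an illustration only and not as part of this CL:

    // Sketch, SSE4.1 only: _mm_mullo_epi32 keeps the low 32 bits of each
    // 32x32 product, matching what the mul_epu32 + shuffle emulation computes.
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi32(fVec, o.fVec); }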
| 182 template <> | 182 template <> |
| 183 class SkNx<4, uint32_t> { |
| 184 public: |
| 185 SkNx(const __m128i& vec) : fVec(vec) {} |
| 186 |
| 187 SkNx() {} |
| 188 SkNx(uint32_t val) : fVec(_mm_set1_epi32(val)) {} |
| 189 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } |
| 190 SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {} |
| 191 |
| 192 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } |
| 193 |
| 194 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } |
| 195 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } |
| 196 // Not quite sure how to best do operator * in SSE2. We probably don't use it. |
| 197 |
| 198 SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } |
| 199 SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } |
| 200 SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } |
| 201 |
| 202 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } |
| 203 SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); } |
| 204 |
| 205 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); } |
| 206 // operator < and > take a little extra fiddling to make work for unsigned ints. |
| 207 |
| 208 uint32_t operator[](int k) const { |
| 209 SkASSERT(0 <= k && k < 4); |
| 210 union { __m128i v; uint32_t us[4]; } pun = {fVec}; |
| 211 return pun.us[k&3]; |
| 212 } |
| 213 |
| 214 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 215 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 216 return _mm_blendv_epi8(e.fVec, t.fVec, fVec); |
| 217 #else |
| 218 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 219 _mm_andnot_si128(fVec, e.fVec)); |
| 220 #endif |
| 221 } |
| 222 |
| 223 __m128i fVec; |
| 224 }; |
| 225 |
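The two comments in the new Sk4u class above leave operator * and the ordered compares unimplemented. For the compares, the usual SSE2 "extra fiddling" is to bias both sides by 0x80000000 so the signed compare intrinsics yield the unsigned ordering; a minimal sketch with a hypothetical helper name (illustration only, not part of this CL):

    // Hypothetical helper: lane-wise unsigned a < b, returned as a full-lane mask.
    static inline Sk4u sk4u_lt(const Sk4u& a, const Sk4u& b) {
        const __m128i bias = _mm_set1_epi32((int)0x80000000);
        return _mm_cmplt_epi32(_mm_xor_si128(a.fVec, bias),
                               _mm_xor_si128(b.fVec, bias));
    }

For operator *, the low 32 bits of a product do not depend on signedness, so the Sk4i multiply above (or _mm_mullo_epi32 on SSE4.1) would double as an unsigned multiply if one is ever needed.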
| 226 |
| 227 template <> |
| 183 class SkNx<4, uint16_t> { | 228 class SkNx<4, uint16_t> { |
| 184 public: | 229 public: |
| 185 SkNx(const __m128i& vec) : fVec(vec) {} | 230 SkNx(const __m128i& vec) : fVec(vec) {} |
| 186 | 231 |
| 187 SkNx() {} | 232 SkNx() {} |
| 188 SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} | 233 SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} |
| 189 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } | 234 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } |
| 190 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {} | 235 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {} |
| 191 | 236 |
| 192 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } | 237 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } |
| (...skipping 115 matching lines...) |
| 308 } | 353 } |
| 309 | 354 |
| 310 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 355 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 311 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 356 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 312 _mm_andnot_si128(fVec, e.fVec)); | 357 _mm_andnot_si128(fVec, e.fVec)); |
| 313 } | 358 } |
| 314 | 359 |
| 315 __m128i fVec; | 360 __m128i fVec; |
| 316 }; | 361 }; |
| 317 | 362 |
| 318 template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { | 363 template<> /*static*/ inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { |
| 319 return _mm_cvtepi32_ps(src.fVec); | 364 return _mm_cvtepi32_ps(src.fVec); |
| 320 } | 365 } |
| 366 template<> /*static*/ inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { |
| 367 return SkNx_cast<float>(Sk4i::Load(&src)); |
| 368 } |
| 321 | 369 |
| 322 template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { | 370 template <> /*static*/ inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { |
| 323 return _mm_cvttps_epi32(src.fVec); | 371 return _mm_cvttps_epi32(src.fVec); |
| 324 } | 372 } |
| 325 | 373 |
| 326 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) { | 374 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) { |
| 327 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | 375 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 328 // TODO: This seems to be causing code generation problems. Investigate? | 376 // TODO: This seems to be causing code generation problems. Investigate? |
| 329 return _mm_packus_epi32(src.fVec, src.fVec); | 377 return _mm_packus_epi32(src.fVec, src.fVec); |
| 330 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 378 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 331 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. | 379 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. |
| 332 const int _ = ~0; | 380 const int _ = ~0; |
| 333 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); | 381 return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); |
| 334 #else | 382 #else |
| 335 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want. | 383 // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want. |
| 336 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); | 384 __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16); |
| 337 return _mm_packs_epi32(x,x); | 385 return _mm_packs_epi32(x,x); |
| 338 #endif | 386 #endif |
| 339 } | 387 } |
| 340 | 388 |
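For reference, the SSE2 fallback above works because the slli/srai pair sign-extends the low 16 bits of each lane, so _mm_packs_epi32's signed saturation reproduces exactly those 16 bits. A hypothetical spot check, assuming the usual Sk4h operator[] accessor (illustration only, not part of this CL):

    // Values that fit in 16 bits should round-trip through the cast unchanged.
    Sk4i v(1, 0x1234, 0xFFFF, 42);
    Sk4h h = SkNx_cast<uint16_t>(v);
    SkASSERT(h[0] == 1 && h[1] == 0x1234 && h[2] == 0xFFFF && h[3] == 42);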
| 341 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { | 389 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
| 342 return SkNx_cast<uint16_t>(SkNx_cast<int>(src)); | 390 return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src)); |
| 343 } | 391 } |
| 344 | 392 |
| 345 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { | 393 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
| 346 auto _32 = _mm_cvttps_epi32(src.fVec); | 394 auto _32 = _mm_cvttps_epi32(src.fVec); |
| 347 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 395 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 348 const int _ = ~0; | 396 const int _ = ~0; |
| 349 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); | 397 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); |
| 350 #else | 398 #else |
| 351 auto _16 = _mm_packus_epi16(_32, _32); | 399 auto _16 = _mm_packus_epi16(_32, _32); |
| 352 return _mm_packus_epi16(_16, _16); | 400 return _mm_packus_epi16(_16, _16); |
| (...skipping 31 matching lines...) |
| 384 } | 432 } |
| 385 | 433 |
| 386 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { | 434 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { |
| 387 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); | 435 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); |
| 388 } | 436 } |
| 389 | 437 |
| 390 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { | 438 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
| 391 return _mm_packus_epi16(src.fVec, src.fVec); | 439 return _mm_packus_epi16(src.fVec, src.fVec); |
| 392 } | 440 } |
| 393 | 441 |
| 394 template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) { | 442 template<> /*static*/ inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) { |
| 395 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); | 443 return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); |
| 396 } | 444 } |
| 397 | 445 |
| 398 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) { | 446 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) { |
| 399 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); | 447 return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); |
| 400 } | 448 } |
| 401 | 449 |
| 402 static inline Sk4i Sk4f_round(const Sk4f& x) { | 450 static inline Sk4i Sk4f_round(const Sk4f& x) { |
| 403 return _mm_cvtps_epi32(x.fVec); | 451 return _mm_cvtps_epi32(x.fVec); |
| 404 } | 452 } |
| 405 | 453 |
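Note that Sk4f_round uses _mm_cvtps_epi32, which follows the current MXCSR rounding mode (round-to-nearest-even by default), whereas SkNx_cast<int32_t, float> above uses the truncating _mm_cvttps_epi32. A small sketch of the difference, assuming the default rounding mode (illustration only):

    Sk4f x(0.5f, 1.5f, 2.7f, -1.5f);
    Sk4i r = Sk4f_round(x);          // 0, 2, 3, -2  (nearest, ties to even)
    Sk4i t = SkNx_cast<int32_t>(x);  // 0, 1, 2, -1  (truncated toward zero)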
| 406 static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) { | 454 static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) { |
| 407 __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0), | 455 __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0), |
| 408 hi = _mm_loadu_si128(((__m128i*)ptr) + 1); | 456 hi = _mm_loadu_si128(((__m128i*)ptr) + 1); |
| (...skipping 11 matching lines...) |
| 420 const Sk4h& a) { | 468 const Sk4h& a) { |
| 421 __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); | 469 __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec); |
| 422 __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); | 470 __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec); |
| 423 __m128i lo = _mm_unpacklo_epi32(rg, ba); | 471 __m128i lo = _mm_unpacklo_epi32(rg, ba); |
| 424 __m128i hi = _mm_unpackhi_epi32(rg, ba); | 472 __m128i hi = _mm_unpackhi_epi32(rg, ba); |
| 425 _mm_storeu_si128(((__m128i*) dst) + 0, lo); | 473 _mm_storeu_si128(((__m128i*) dst) + 0, lo); |
| 426 _mm_storeu_si128(((__m128i*) dst) + 1, hi); | 474 _mm_storeu_si128(((__m128i*) dst) + 1, hi); |
| 427 } | 475 } |
| 428 | 476 |
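Sk4h_store4 is the inverse of Sk4h_load4: the unpacklo_epi16 steps interleave the channels and the unpack*_epi32 steps lay the four pixels out contiguously. A small usage sketch with made-up values (illustration only):

    uint16_t px[16];
    Sk4h r(1,2,3,4), g(5,6,7,8), b(9,10,11,12), a(13,14,15,16);
    Sk4h_store4(px, r, g, b, a);
    // px == {1,5,9,13,  2,6,10,14,  3,7,11,15,  4,8,12,16},
    // i.e. channel k of pixel i lands at px[4*i + k].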
| 429 #endif//SkNx_sse_DEFINED | 477 #endif//SkNx_sse_DEFINED |
| OLD | NEW |