OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
10 | 10 |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
104 | 104 |
105 template <> | 105 template <> |
106 class SkNx<4, float> { | 106 class SkNx<4, float> { |
107 public: | 107 public: |
108 SkNx(const __m128& vec) : fVec(vec) {} | 108 SkNx(const __m128& vec) : fVec(vec) {} |
109 | 109 |
110 SkNx() {} | 110 SkNx() {} |
111 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} | 111 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} |
112 static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); } | 112 static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); } |
113 | 113 |
114 static SkNx FromBytes(const uint8_t bytes[4]) { | |
115 __m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes); | |
116 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
117 const char _ = ~0; // Zero these bytes. | |
118 __m128i fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_,
2,_,_,_, 3,_,_,_)); | |
119 #else | |
120 __m128i fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), | |
121 fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); | |
122 #endif | |
123 return SkNx(_mm_cvtepi32_ps(fix8_32)); | |
124 // TODO: use _mm_cvtepu8_epi32 w/SSE4.1? | |
125 } | |
126 | |
127 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} | 114 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} |
128 | 115 |
129 void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } | 116 void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } |
130 void toBytes(uint8_t bytes[4]) const { | |
131 __m128i fix8_32 = _mm_cvttps_epi32(fVec), | |
132 fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), | |
133 fix8 = _mm_packus_epi16(fix8_16, fix8_16); | |
134 *(int*)bytes = _mm_cvtsi128_si32(fix8); | |
135 } | |
136 | |
137 static void ToBytes(uint8_t bytes[16], | |
138 const SkNx& a, const SkNx& b, const SkNx& c, const SkNx&
d) { | |
139 _mm_storeu_si128((__m128i*)bytes, | |
140 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fV
ec), | |
141 _mm_cvttps_epi32(b.fV
ec)), | |
142 _mm_packus_epi16(_mm_cvttps_epi32(c.fV
ec), | |
143 _mm_cvttps_epi32(d.fV
ec)))); | |
144 } | |
145 | 117 |
146 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 118 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } |
147 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 119 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } |
148 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 120 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } |
149 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 121 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } |
150 | 122 |
151 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} | 123 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} |
152 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} | 124 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} |
153 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} | 125 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} |
154 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} | 126 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} |
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
246 | 218 |
247 template <int k> uint16_t kth() const { | 219 template <int k> uint16_t kth() const { |
248 SkASSERT(0 <= k && k < 8); | 220 SkASSERT(0 <= k && k < 8); |
249 return _mm_extract_epi16(fVec, k); | 221 return _mm_extract_epi16(fVec, k); |
250 } | 222 } |
251 | 223 |
252 __m128i fVec; | 224 __m128i fVec; |
253 }; | 225 }; |
254 | 226 |
255 template <> | 227 template <> |
| 228 class SkNx<4, uint8_t> { |
| 229 public: |
| 230 SkNx(const __m128i& vec) : fVec(vec) {} |
| 231 |
| 232 SkNx() {} |
| 233 static SkNx Load(const uint8_t vals[4]) { return _mm_cvtsi32_si128(*(const i
nt*)vals); } |
| 234 void store(uint8_t vals[4]) const { *(int*)vals = _mm_cvtsi128_si32(fVec); } |
| 235 |
| 236 // TODO as needed |
| 237 |
| 238 __m128i fVec; |
| 239 }; |
| 240 |
| 241 template <> |
| 242 class SkNx<8, uint8_t> { |
| 243 public: |
| 244 SkNx(const __m128i& vec) : fVec(vec) {} |
| 245 |
| 246 SkNx() {} |
| 247 static SkNx Load(const uint8_t vals[8]) { return _mm_loadl_epi64((const __m1
28i*)vals); } |
| 248 void store(uint8_t vals[8]) const { _mm_storel_epi64((__m128i*)vals, fVec);
} |
| 249 |
| 250 // TODO as needed |
| 251 |
| 252 __m128i fVec; |
| 253 }; |
| 254 |
| 255 template <> |
256 class SkNx<16, uint8_t> { | 256 class SkNx<16, uint8_t> { |
257 public: | 257 public: |
258 SkNx(const __m128i& vec) : fVec(vec) {} | 258 SkNx(const __m128i& vec) : fVec(vec) {} |
259 | 259 |
260 SkNx() {} | 260 SkNx() {} |
261 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} | 261 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} |
262 static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m
128i*)vals); } | 262 static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m
128i*)vals); } |
263 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 263 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
264 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 264 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
265 uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 265 uint8_t i, uint8_t j, uint8_t k, uint8_t l, |
(...skipping 23 matching lines...) Expand all Loading... |
289 | 289 |
290 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 290 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
291 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 291 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
292 _mm_andnot_si128(fVec, e.fVec)); | 292 _mm_andnot_si128(fVec, e.fVec)); |
293 } | 293 } |
294 | 294 |
295 __m128i fVec; | 295 __m128i fVec; |
296 }; | 296 }; |
297 | 297 |
298 | 298 |
299 template<> | 299 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { |
300 inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) { | |
301 return _mm_cvttps_epi32(src.fVec); | 300 return _mm_cvttps_epi32(src.fVec); |
302 } | 301 } |
303 | 302 |
| 303 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { |
| 304 auto _32 = _mm_cvttps_epi32(src.fVec); |
| 305 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 306 const int _ = ~0; |
| 307 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); |
| 308 #else |
| 309 auto _16 = _mm_packus_epi16(_32, _32); |
| 310 return _mm_packus_epi16(_16, _16); |
| 311 #endif |
| 312 } |
| 313 |
| 314 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { |
| 315 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 316 const int _ = ~0; |
| 317 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,
_, 3,_,_,_)); |
| 318 #else |
| 319 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), |
| 320 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); |
| 321 #endif |
| 322 return _mm_cvtepi32_ps(_32); |
| 323 } |
| 324 |
| 325 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
| 326 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { |
| 327 _mm_storeu_si128((__m128i*)bytes, |
| 328 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), |
| 329 _mm_cvttps_epi32(b.fVec))
, |
| 330 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), |
| 331 _mm_cvttps_epi32(d.fVec))
)); |
| 332 } |
| 333 |
| 334 |
304 } // namespace | 335 } // namespace |
305 | 336 |
306 #endif//SkNx_sse_DEFINED | 337 #endif//SkNx_sse_DEFINED |
OLD | NEW |