| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
| 10 | 10 |
| (...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 104 | 104 |
| 105 template <> | 105 template <> |
| 106 class SkNx<4, float> { | 106 class SkNx<4, float> { |
| 107 public: | 107 public: |
// SkNx<4, float> construction; these rows are the same in the OLD and NEW columns.
//  - from a raw __m128: stores the register unchanged in fVec.
//  - default constructor: has no initializer, so fVec is left uninitialized.
//  - from one float: _mm_set1_ps copies val into all four lanes.
//  - Load: _mm_loadu_ps reads four consecutive floats with no alignment requirement.
| 108 SkNx(const __m128& vec) : fVec(vec) {} | 108 SkNx(const __m128& vec) : fVec(vec) {} |
| 109 | 109 |
| 110 SkNx() {} | 110 SkNx() {} |
| 111 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} | 111 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} |
| 112 static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); } | 112 static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); } |
| 113 | 113 |
// FromBytes appears only in the OLD column: this change removes it from SkNx<4, float>.
// It widens four uint8_t values into four float lanes.
// NOTE(review): the NEW column contains SkNx_cast<float, uint8_t, 4> with the same widening
// steps; presumably callers are expected to use that instead - confirm.
// The four bytes are read as one 32-bit value into the low lane; the other lanes become zero.
// NOTE(review): the read goes through a const int* cast of a uint8_t pointer, which assumes
// unaligned, type-punned 4-byte access is acceptable on the target - confirm.
| 114 static SkNx FromBytes(const uint8_t bytes[4]) { | |
| 115 __m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes); | |
// SSSE3 path: a single byte shuffle places source byte i in the low byte of 32-bit lane i.
// A control byte with its top bit set (here ~0) makes _mm_shuffle_epi8 write a zero byte.
| 116 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
| 117 const char _ = ~0; // Zero these bytes. | |
| 118 __m128i fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_,
2,_,_,_, 3,_,_,_)); | |
// Fallback path: interleave with zero twice to zero-extend 8 -> 16 -> 32 bits.
| 119 #else | |
| 120 __m128i fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), | |
| 121 fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); | |
| 122 #endif | |
// Convert the four int32 lanes to float.
| 123 return SkNx(_mm_cvtepi32_ps(fix8_32)); | |
| 124 // TODO: use _mm_cvtepu8_epi32 w/SSE4.1? | |
| 125 } | |
| 126 | |
// Four-value constructor and store; the code is the same in both columns, only the diff
// line numbers differ (127/114 and 129/116).
//  - _mm_setr_ps(a,b,c,d) places a in lane 0 through d in lane 3, i.e. memory order.
//  - store writes the four lanes to vals with _mm_storeu_ps (no alignment requirement).
| 127 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} | 114 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} |
| 128 | 115 |
| 129 void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } | 116 void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } |
// toBytes appears only in the OLD column: this change removes it from SkNx<4, float>.
// It converts the four float lanes to four bytes and writes them to bytes[0..3].
// Steps: truncate each float toward zero to int32 (_mm_cvttps_epi32), then apply
// _mm_packus_epi16 twice. That intrinsic treats its inputs as signed 16-bit lanes and
// saturates each to an unsigned byte, so for lanes already in [0,255] the low four bytes
// of the final vector are the four converted values in lane order.
// NOTE(review): lanes outside [0,255] are packed per 16-bit half, not per 32-bit lane;
// presumably callers clamp beforehand - confirm.
// NOTE(review): the final write goes through an int* cast of a uint8_t pointer, which
// assumes unaligned, type-punned 4-byte access is acceptable on the target - confirm.
| 130 void toBytes(uint8_t bytes[4]) const { | |
| 131 __m128i fix8_32 = _mm_cvttps_epi32(fVec), | |
| 132 fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), | |
| 133 fix8 = _mm_packus_epi16(fix8_16, fix8_16); | |
| 134 *(int*)bytes = _mm_cvtsi128_si32(fix8); | |
| 135 } | |
| 136 | |
// Static ToBytes appears only in the OLD column: this change removes it from the class.
// The NEW column adds a free function Sk4f_ToBytes with the same parameter list and the
// same intrinsic sequence.
// It truncates the lanes of a, b, c and d to int32 and packs them into 16 bytes, written
// to bytes with an unaligned store. For lanes already in [0,255] the output order is
// a0..a3, b0..b3, c0..c3, d0..d3.
// NOTE(review): _mm_packus_epi16 operates on 16-bit halves of each int32 lane, so values
// outside [0,255] are not simply clamped per lane - confirm callers clamp beforehand.
| 137 static void ToBytes(uint8_t bytes[16], | |
| 138 const SkNx& a, const SkNx& b, const SkNx& c, const SkNx&
d) { | |
| 139 _mm_storeu_si128((__m128i*)bytes, | |
| 140 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fV
ec), | |
| 141 _mm_cvttps_epi32(b.fV
ec)), | |
| 142 _mm_packus_epi16(_mm_cvttps_epi32(c.fV
ec), | |
| 143 _mm_cvttps_epi32(d.fV
ec)))); | |
| 144 } | |
| 145 | 117 |
// Lane-wise arithmetic on the four floats: add, subtract, multiply, divide.
// Each returns a new SkNx built from the matching _mm_*_ps intrinsic; neither operand is
// modified. The code is the same in both columns, only the diff line numbers differ.
| 146 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 118 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } |
| 147 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 119 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } |
| 148 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 120 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } |
| 149 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 121 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } |
| 150 | 122 |
// Lane-wise comparisons: ==, !=, < and >.
// The result is not a bool. Each operator returns an SkNx holding the raw _mm_cmp*_ps
// mask: a lane has all bits set where the comparison holds and all bits clear otherwise.
// The code is the same in both columns; the diff viewer wrapped the closing brace of each
// operator onto its own row.
| 151 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} | 123 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec);
} |
| 152 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} | 124 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec);
} |
| 153 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} | 125 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec);
} |
| 154 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} | 126 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec);
} |
| (...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 246 | 218 |
// kth<k>() returns 16-bit lane k of fVec as a uint16_t, using _mm_extract_epi16.
// k is a template argument, so the lane index is a compile-time constant; SkASSERT checks
// that it lies in [0, 8).
// NOTE(review): the header of the enclosing class lies in the skipped region of this diff;
// the eight-lane bound suggests a vector of eight 16-bit values - confirm.
| 247 template <int k> uint16_t kth() const { | 219 template <int k> uint16_t kth() const { |
| 248 SkASSERT(0 <= k && k < 8); | 220 SkASSERT(0 <= k && k < 8); |
| 249 return _mm_extract_epi16(fVec, k); | 221 return _mm_extract_epi16(fVec, k); |
| 250 } | 222 } |
| 251 | 223 |
| 252 __m128i fVec; | 224 __m128i fVec; |
| 253 }; | 225 }; |
| 254 | 226 |
// SkNx<4, uint8_t> is added by this change: after the shared template<> row, every row
// carries only a NEW-side line number.
// It is a minimal holder for four bytes kept in an __m128i; only construction, Load and
// store are provided, and the TODO row marks further operations as not yet written.
//  - Load reads the four bytes as one 32-bit value into the low lane (_mm_cvtsi32_si128
//    zeroes the rest of the register).
//  - store writes the low 32 bits of fVec back as four bytes.
//  - the default constructor leaves fVec uninitialized.
// NOTE(review): Load and store access the uint8_t array through an int pointer cast, which
// assumes unaligned, type-punned 4-byte access is acceptable on the target - confirm.
| 255 template <> | 227 template <> |
| 228 class SkNx<4, uint8_t> { |
| 229 public: |
| 230 SkNx(const __m128i& vec) : fVec(vec) {} |
| 231 |
| 232 SkNx() {} |
| 233 static SkNx Load(const uint8_t vals[4]) { return _mm_cvtsi32_si128(*(const i
nt*)vals); } |
| 234 void store(uint8_t vals[4]) const { *(int*)vals = _mm_cvtsi128_si32(fVec); } |
| 235 |
| 236 // TODO as needed |
| 237 |
| 238 __m128i fVec; |
| 239 }; |
| 240 |
// SkNx<8, uint8_t> is added by this change: every row carries only a NEW-side line number.
// It is a minimal holder for eight bytes kept in the low half of an __m128i; only
// construction, Load and store are provided, and the TODO row marks further operations as
// not yet written.
//  - Load uses _mm_loadl_epi64, which reads 64 bits into the low half and zeroes the
//    upper half.
//  - store uses _mm_storel_epi64, which writes only the low 64 bits of fVec.
//  - the default constructor leaves fVec uninitialized.
| 241 template <> |
| 242 class SkNx<8, uint8_t> { |
| 243 public: |
| 244 SkNx(const __m128i& vec) : fVec(vec) {} |
| 245 |
| 246 SkNx() {} |
| 247 static SkNx Load(const uint8_t vals[8]) { return _mm_loadl_epi64((const __m1
28i*)vals); } |
| 248 void store(uint8_t vals[8]) const { _mm_storel_epi64((__m128i*)vals, fVec);
} |
| 249 |
| 250 // TODO as needed |
| 251 |
| 252 __m128i fVec; |
| 253 }; |
| 254 |
| 255 template <> |
| 256 class SkNx<16, uint8_t> { | 256 class SkNx<16, uint8_t> { |
| 257 public: | 257 public: |
// SkNx<16, uint8_t> construction; these rows are the same in the OLD and NEW columns.
//  - from a raw __m128i: stores the register unchanged in fVec.
//  - default constructor: has no initializer, so fVec is left uninitialized.
//  - from one byte: _mm_set1_epi8 copies val into all sixteen byte lanes.
//  - Load: _mm_loadu_si128 reads sixteen bytes with no alignment requirement.
| 258 SkNx(const __m128i& vec) : fVec(vec) {} | 258 SkNx(const __m128i& vec) : fVec(vec) {} |
| 259 | 259 |
| 260 SkNx() {} | 260 SkNx() {} |
| 261 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} | 261 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} |
| 262 static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m
128i*)vals); } | 262 static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m
128i*)vals); } |
| 263 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 263 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
| 264 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 264 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
| 265 uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 265 uint8_t i, uint8_t j, uint8_t k, uint8_t l, |
| (...skipping 23 matching lines...) Expand all Loading... |
| 289 | 289 |
// thenElse treats this vector as a bit mask and selects between t and e bit by bit:
// result = (mask AND t) OR ((NOT mask) AND e). _mm_andnot_si128(a, b) computes (NOT a) AND b.
// Bits set in fVec take the value from t; clear bits take the value from e.
// NOTE(review): whole-lane selection only results if every lane of the mask is all ones or
// all zeros; presumably the mask comes from a comparison operator - confirm.
| 290 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 290 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 291 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 291 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 292 _mm_andnot_si128(fVec, e.fVec)); | 292 _mm_andnot_si128(fVec, e.fVec)); |
| 293 } | 293 } |
| 294 | 294 |
| 295 __m128i fVec; | 295 __m128i fVec; |
| 296 }; | 296 }; |
| 297 | 297 |
| 298 | 298 |
// SkNx_cast<int, float, 4>: converts four float lanes to four int32 lanes with
// _mm_cvttps_epi32, which truncates toward zero.
// The body is unchanged by this diff. Only the signature is respelled: the OLD column
// writes SkNx<4, int> and SkNx<4, float> on two rows, the NEW column writes Sk4i and Sk4f
// on one row.
// NOTE(review): Sk4i and Sk4f are declared outside this excerpt; presumably they are
// aliases for the two SkNx types named in the OLD column - confirm.
| 299 template<> | 299 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { |
| 300 inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) { | |
| 301 return _mm_cvttps_epi32(src.fVec); | 300 return _mm_cvttps_epi32(src.fVec); |
| 302 } | 301 } |
| 303 | 302 |
// SkNx_cast<uint8_t, float, 4> is added by this change (NEW-side rows only).
// It converts four float lanes to four bytes held in the low 32 bits of the result.
// NOTE(review): Sk4b is declared outside this excerpt; presumably an alias for
// SkNx<4, uint8_t> - confirm.
// First each float is truncated toward zero to int32.
| 303 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { |
| 304 auto _32 = _mm_cvttps_epi32(src.fVec); |
// SSSE3 path: gather byte 0 of each int32 lane (source bytes 0, 4, 8, 12) into the low
// four bytes. Control bytes with the top bit set (~0) make the remaining bytes zero.
| 305 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 306 const int _ = ~0; |
| 307 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_
,_)); |
// Fallback path: two rounds of _mm_packus_epi16, which narrows signed 16-bit lanes to
// unsigned bytes with saturation.
// NOTE(review): for lanes outside [0,255] the two paths can give different bytes (the
// shuffle keeps the low byte of each lane, the packs saturate each 16-bit half);
// presumably callers clamp beforehand - confirm.
| 308 #else |
| 309 auto _16 = _mm_packus_epi16(_32, _32); |
| 310 return _mm_packus_epi16(_16, _16); |
| 311 #endif |
| 312 } |
| 313 |
// SkNx_cast<float, uint8_t, 4> is added by this change (NEW-side rows only).
// It widens the four low bytes of src.fVec to four float lanes, using the same steps as
// the FromBytes method shown in the OLD column.
// NOTE(review): Sk4b is declared outside this excerpt; presumably an alias for
// SkNx<4, uint8_t> - confirm.
// SSSE3 path: one byte shuffle puts source byte i in the low byte of int32 lane i. Control
// bytes with the top bit set (~0) make the other three bytes of each lane zero.
| 314 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { |
| 315 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 316 const int _ = ~0; |
| 317 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,
_, 3,_,_,_)); |
// Fallback path: interleave with zero twice to zero-extend 8 -> 16 -> 32 bits.
| 318 #else |
| 319 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), |
| 320 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); |
| 321 #endif |
// Convert the four int32 lanes to float.
| 322 return _mm_cvtepi32_ps(_32); |
| 323 } |
| 324 |
// Sk4f_ToBytes is added by this change (NEW-side rows only). It is a free function with
// the same parameter list and intrinsic sequence as the static ToBytes member shown in the
// OLD column.
// It truncates the lanes of a, b, c and d toward zero to int32, packs them into 16 bytes
// and writes them to bytes with an unaligned store. For lanes already in [0,255] the
// output order is a0..a3, b0..b3, c0..c3, d0..d3.
// NOTE(review): _mm_packus_epi16 operates on 16-bit halves of each int32 lane, so values
// outside [0,255] are not simply clamped per lane - confirm callers clamp beforehand.
| 325 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
| 326 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { |
| 327 _mm_storeu_si128((__m128i*)bytes, |
| 328 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), |
| 329 _mm_cvttps_epi32(b.fVec))
, |
| 330 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), |
| 331 _mm_cvttps_epi32(d.fVec))
)); |
| 332 } |
| 333 |
| 334 |
| 304 } // namespace | 335 } // namespace |
| 305 | 336 |
| 306 #endif//SkNx_sse_DEFINED | 337 #endif//SkNx_sse_DEFINED |
| OLD | NEW |