| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  * Copyright 2015 Google Inc. | 2  * Copyright 2015 Google Inc. | 
| 3  * | 3  * | 
| 4  * Use of this source code is governed by a BSD-style license that can be | 4  * Use of this source code is governed by a BSD-style license that can be | 
| 5  * found in the LICENSE file. | 5  * found in the LICENSE file. | 
| 6  */ | 6  */ | 
| 7 | 7 | 
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED | 
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED | 
| 10 | 10 | 
| (...skipping 93 matching lines...) | 
| 104 | 104 | 
| 105 template <> | 105 template <> | 
| 106 class SkNx<4, float> { | 106 class SkNx<4, float> { | 
| 107 public: | 107 public: | 
| 108     SkNx(const __m128& vec) : fVec(vec) {} | 108     SkNx(const __m128& vec) : fVec(vec) {} | 
| 109 | 109 | 
| 110     SkNx() {} | 110     SkNx() {} | 
| 111     SkNx(float val)           : fVec( _mm_set1_ps(val) ) {} | 111     SkNx(float val)           : fVec( _mm_set1_ps(val) ) {} | 
| 112     static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); } | 112     static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); } | 
| 113 | 113 | 
| 114     static SkNx FromBytes(const uint8_t bytes[4]) { |  | 
| 115         __m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes); |  | 
| 116     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |  | 
| 117         const char _ = ~0;  // Zero these bytes. |  | 
| 118         __m128i fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)); |  | 
| 119     #else |  | 
| 120         __m128i fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()), |  | 
| 121                 fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); |  | 
| 122     #endif |  | 
| 123         return SkNx(_mm_cvtepi32_ps(fix8_32)); |  | 
| 124         // TODO: use _mm_cvtepu8_epi32 w/SSE4.1? |  | 
| 125     } |  | 
| 126 |  | 
| 127     SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} | 114     SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} | 
| 128 | 115 | 
| 129     void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } | 116     void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } | 
| 130     void toBytes(uint8_t bytes[4]) const { |  | 
| 131         __m128i fix8_32 = _mm_cvttps_epi32(fVec), |  | 
| 132                 fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), |  | 
| 133                 fix8    = _mm_packus_epi16(fix8_16, fix8_16); |  | 
| 134         *(int*)bytes = _mm_cvtsi128_si32(fix8); |  | 
| 135     } |  | 
| 136 |  | 
| 137     static void ToBytes(uint8_t bytes[16], |  | 
| 138                         const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) { |  | 
| 139         _mm_storeu_si128((__m128i*)bytes, |  | 
| 140                          _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), |  | 
| 141                                                            _mm_cvttps_epi32(b.fVec)), |  | 
| 142                                           _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), |  | 
| 143                                                            _mm_cvttps_epi32(d.fVec)))); |  | 
| 144     } |  | 
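The three removed members (`FromBytes`, `toBytes`, and the static `ToBytes`) reappear below as free functions: `SkNx_cast` specializations and `Sk4f_ToBytes`. A rough call-site migration sketch, assuming the usual deducing `SkNx_cast<D>(src)` convenience wrapper from SkNx.h:

```cpp
// Hypothetical migration sketch; SkNx_cast<D>(src) is assumed to be the
// deducing wrapper declared in SkNx.h, not something added by this CL.
uint8_t px[4] = {1, 2, 3, 4};

// Before this CL (member functions):
//   Sk4f v = Sk4f::FromBytes(px);
//   v.toBytes(px);

// After this CL (free-function casts through Sk4b):
Sk4f v = SkNx_cast<float>(Sk4b::Load(px));
SkNx_cast<uint8_t>(v).store(px);
```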
| 145 | 117 | 
| 146     SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 118     SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 
| 147     SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 119     SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 
| 148     SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 120     SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 
| 149     SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 121     SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 
| 150 | 122 | 
| 151     SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } | 123     SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } | 
| 152     SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } | 124     SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } | 
| 153     SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } | 125     SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } | 
| 154     SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } | 126     SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } | 
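Each comparison returns a per-lane mask (all bits set where true, all clear where false), so the results compose with and/andnot into branch-free selects, the same pattern `thenElse()` uses on the `uint8_t` specialization further down. A minimal sketch:

```cpp
// Minimal sketch of using a comparison mask for a branch-free per-lane min.
// select_ps is a hypothetical helper, not part of this header.
static inline __m128 select_ps(__m128 mask, __m128 t, __m128 e) {
    return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, e));
}

static inline Sk4f min4(const Sk4f& a, const Sk4f& b) {
    __m128 lt = _mm_cmplt_ps(a.fVec, b.fVec);  // lane i = a[i] < b[i] ? ~0 : 0
    return select_ps(lt, a.fVec, b.fVec);
}
```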
| (...skipping 91 matching lines...) | 
| 246 | 218 | 
| 247     template <int k> uint16_t kth() const { | 219     template <int k> uint16_t kth() const { | 
| 248         SkASSERT(0 <= k && k < 8); | 220         SkASSERT(0 <= k && k < 8); | 
| 249         return _mm_extract_epi16(fVec, k); | 221         return _mm_extract_epi16(fVec, k); | 
| 250     } | 222     } | 
| 251 | 223 | 
| 252     __m128i fVec; | 224     __m128i fVec; | 
| 253 }; | 225 }; | 
| 254 | 226 | 
| 255 template <> | 227 template <> | 
|  | 228 class SkNx<4, uint8_t> { | 
|  | 229 public: | 
|  | 230     SkNx(const __m128i& vec) : fVec(vec) {} | 
|  | 231 | 
|  | 232     SkNx() {} | 
|  | 233     static SkNx Load(const uint8_t vals[4]) { return _mm_cvtsi32_si128(*(const int*)vals); } | 
|  | 234     void store(uint8_t vals[4]) const { *(int*)vals = _mm_cvtsi128_si32(fVec); } | 
|  | 235 | 
|  | 236     // TODO as needed | 
|  | 237 | 
|  | 238     __m128i fVec; | 
|  | 239 }; | 
|  | 240 | 
|  | 241 template <> | 
|  | 242 class SkNx<8, uint8_t> { | 
|  | 243 public: | 
|  | 244     SkNx(const __m128i& vec) : fVec(vec) {} | 
|  | 245 | 
|  | 246     SkNx() {} | 
|  | 247     static SkNx Load(const uint8_t vals[8]) { return _mm_loadl_epi64((const __m128i*)vals); } | 
|  | 248     void store(uint8_t vals[8]) const { _mm_storel_epi64((__m128i*)vals, fVec); } | 
|  | 249 | 
|  | 250     // TODO as needed | 
|  | 251 | 
|  | 252     __m128i fVec; | 
|  | 253 }; | 
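Both new specializations are load/store shells sized for the casts further down: the 4-lane version moves 4 bytes with `movd`, the 8-lane version 8 bytes with `movq`. A round-trip sketch, assuming `Sk4b`/`Sk8b` alias `SkNx<4, uint8_t>`/`SkNx<8, uint8_t>`:

```cpp
// Round-trip through the new 4- and 8-lane uint8 shells.
// Sk8b is assumed to alias SkNx<8, uint8_t>, analogous to Sk4b.
uint8_t four[4]  = {10, 20, 30, 40};
uint8_t eight[8] = {1, 2, 3, 4, 5, 6, 7, 8};

Sk4b a = Sk4b::Load(four);    // one 32-bit load into the low lanes
Sk8b b = Sk8b::Load(eight);   // one 64-bit load into the low lanes

a.store(four);                // writes back exactly 4 bytes
b.store(eight);               // writes back exactly 8 bytes
```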
|  | 254 | 
|  | 255 template <> | 
| 256 class SkNx<16, uint8_t> { | 256 class SkNx<16, uint8_t> { | 
| 257 public: | 257 public: | 
| 258     SkNx(const __m128i& vec) : fVec(vec) {} | 258     SkNx(const __m128i& vec) : fVec(vec) {} | 
| 259 | 259 | 
| 260     SkNx() {} | 260     SkNx() {} | 
| 261     SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} | 261     SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} | 
| 262     static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m128i*)vals); } | 262     static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m128i*)vals); } | 
| 263     SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 263     SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 
| 264          uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 264          uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 
| 265          uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 265          uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 
| (...skipping 23 matching lines...) | 
| 289 | 289 | 
| 290     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 290     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 
| 291         return _mm_or_si128(_mm_and_si128   (fVec, t.fVec), | 291         return _mm_or_si128(_mm_and_si128   (fVec, t.fVec), | 
| 292                             _mm_andnot_si128(fVec, e.fVec)); | 292                             _mm_andnot_si128(fVec, e.fVec)); | 
| 293     } | 293     } | 
| 294 | 294 | 
| 295     __m128i fVec; | 295     __m128i fVec; | 
| 296 }; | 296 }; | 
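`thenElse` is the classic SSE2 mask select: bytes of `fVec` that are all ones pick from `t`, zero bytes pick from `e`. A minimal usage sketch, assuming an `Sk16b` alias for `SkNx<16, uint8_t>`:

```cpp
// Per-byte select: mask bytes must be 0xFF or 0x00, as comparisons produce.
uint8_t m[16] = {0xFF,0,0xFF,0, 0xFF,0,0xFF,0, 0xFF,0,0xFF,0, 0xFF,0,0xFF,0};
Sk16b mask = Sk16b::Load(m);
Sk16b r = mask.thenElse(Sk16b(200), Sk16b(10));  // 200,10,200,10,...
```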
| 297 | 297 | 
| 298 | 298 | 
| 299 template<> | 299 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { | 
| 300 inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) { |  | 
| 301     return _mm_cvttps_epi32(src.fVec); | 300     return _mm_cvttps_epi32(src.fVec); | 
| 302 } | 301 } | 
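Note that `_mm_cvttps_epi32` truncates toward zero rather than rounding to nearest (the extra `t` in the mnemonic). For example, again assuming the deducing wrapper:

```cpp
Sk4f f(1.9f, -1.9f, 2.5f, -0.5f);
Sk4i i = SkNx_cast<int>(f);   // {1, -1, 2, 0}: truncation toward zero
```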
| 303 | 302 | 
|  | 303 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { | 
|  | 304     auto _32 = _mm_cvttps_epi32(src.fVec); | 
|  | 305 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 
|  | 306     const int _ = ~0; | 
|  | 307     return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); | 
|  | 308 #else | 
|  | 309     auto _16 = _mm_packus_epi16(_32, _32); | 
|  | 310     return     _mm_packus_epi16(_16, _16); | 
|  | 311 #endif | 
|  | 312 } | 
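Both paths assume the truncated values already fit in [0,255], and they fail differently out of range: the SSSE3 `pshufb` simply keeps the low byte of each 32-bit lane, while the SSE2 double `packus` saturates 16-bit intermediates. In-range behavior, sketched:

```cpp
// Values assumed in [0,255]; out-of-range inputs diverge between the paths.
uint8_t out[4];
SkNx_cast<uint8_t>(Sk4f(0.0f, 1.5f, 127.0f, 255.0f)).store(out);
// out == {0, 1, 127, 255}; 1.5f truncates to 1 before packing.
```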
|  | 313 | 
|  | 314 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { | 
|  | 315 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 
|  | 316     const int _ = ~0; | 
|  | 317     auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)); | 
|  | 318 #else | 
|  | 319     auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), | 
|  | 320          _32 = _mm_unpacklo_epi16(_16,     _mm_setzero_si128()); | 
|  | 321 #endif | 
|  | 322     return _mm_cvtepi32_ps(_32); | 
|  | 323 } | 
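The inverse cast zero-extends each byte to a 32-bit lane (the `~0` shuffle indices write zeros on SSSE3; the SSE2 fallback unpacks against zero twice) before converting to float:

```cpp
uint8_t in[4] = {0, 1, 128, 255};
Sk4f f = SkNx_cast<float>(Sk4b::Load(in));  // {0.f, 1.f, 128.f, 255.f}
```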
|  | 324 | 
|  | 325 static inline void Sk4f_ToBytes(uint8_t bytes[16], | 
|  | 326                                 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { | 
|  | 327     _mm_storeu_si128((__m128i*)bytes, | 
|  | 328                      _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), | 
|  | 329                                                        _mm_cvttps_epi32(b.fVec)), | 
|  | 330                                       _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), | 
|  | 331                                                        _mm_cvttps_epi32(d.fVec))); | 
|  | 332 } | 
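`Sk4f_ToBytes` replaces the removed static `ToBytes`: it packs four float vectors (values again assumed in [0,255]) into 16 contiguous bytes with a single unaligned store, handy for writing four pixels' worth of channels at once. Usage sketch:

```cpp
uint8_t dst[16];
Sk4f_ToBytes(dst,
             Sk4f( 0,  1,  2,  3), Sk4f( 4,  5,  6,  7),
             Sk4f( 8,  9, 10, 11), Sk4f(12, 13, 14, 15));
// dst == {0, 1, 2, ..., 15}
```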
|  | 333 | 
|  | 334 | 
| 304 }  // namespace | 335 }  // namespace | 
| 305 | 336 | 
| 306 #endif//SkNx_sse_DEFINED | 337 #endif//SkNx_sse_DEFINED | 