| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
| 10 | 10 |
| 11 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent. | 11 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent. |
| 12 // If you do, make sure this is in a static inline function... anywhere else risks violating ODR. |
| 12 | 13 |
| 13 #define SKNX_IS_FAST | 14 #define SKNX_IS_FAST |
| 14 | 15 |
| 15 namespace { // See SkNx.h | |
| 16 | |
| 17 | |
| 18 template <> | 16 template <> |
| 19 class SkNx<2, float> { | 17 class SkNx<2, float> { |
| 20 public: | 18 public: |
| 21 SkNx(const __m128& vec) : fVec(vec) {} | 19 SkNx(const __m128& vec) : fVec(vec) {} |
| 22 | 20 |
| 23 SkNx() {} | 21 SkNx() {} |
| 24 SkNx(float val) : fVec(_mm_set1_ps(val)) {} | 22 SkNx(float val) : fVec(_mm_set1_ps(val)) {} |
| 25 static SkNx Load(const void* ptr) { | 23 static SkNx Load(const void* ptr) { |
| 26 return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr)); | 24 return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr)); |
| 27 } | 25 } |
| 28 SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {} | 26 SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {} |
| 29 | 27 |
| 30 void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); } | 28 void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); } |
| 31 | 29 |
| 32 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 30 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } |
| 33 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 31 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } |
| 34 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 32 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } |
| 35 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 33 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } |
| 36 | 34 |
| 37 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } | 35 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } |
| 38 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } | 36 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } |
| 39 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } | 37 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } |
| 40 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } | 38 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } |
| 41 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); } | 39 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); } |
| 42 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); } | 40 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); } |
| 43 | 41 |
| 44 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } | 42 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } |
| 45 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } | 43 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } |
| 46 | 44 |
| 47 SkNx sqrt() const { return _mm_sqrt_ps (fVec); } | 45 SkNx sqrt () const { return _mm_sqrt_ps (fVec); } |
| 48 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } | 46 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } |
| 49 SkNx rsqrt1() const { return this->rsqrt0(); } | 47 SkNx rsqrt1() const { return this->rsqrt0(); } |
| 50 SkNx rsqrt2() const { return this->rsqrt1(); } | 48 SkNx rsqrt2() const { return this->rsqrt1(); } |
| 51 | 49 |
| 52 SkNx invert() const { return SkNx(1) / *this; } | 50 SkNx invert() const { return SkNx(1) / *this; } |
| 53 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } | 51 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } |
| 54 | 52 |
| 55 template <int k> float kth() const { | 53 float operator[](int k) const { |
| 56 SkASSERT(0 <= k && k < 2); | 54 SkASSERT(0 <= k && k < 2); |
| 57 union { __m128 v; float fs[4]; } pun = {fVec}; | 55 union { __m128 v; float fs[4]; } pun = {fVec}; |
| 58 return pun.fs[k&1]; | 56 return pun.fs[k&1]; |
| 59 } | 57 } |
| 58 template <int k> float kth() const { return (*this)[k]; } |
| 60 | 59 |
| 61 bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); } | 60 bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); } |
| 62 bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); } | 61 bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); } |
| 63 | 62 |
| 64 __m128 fVec; | 63 __m128 fVec; |
| 65 }; | 64 }; |
| 66 | 65 |
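The Sk2f change above swaps the template kth<k>() accessor for a runtime operator[]: _mm_extract_* style intrinsics need a compile-time-constant lane index, so the new accessor instead puns the vector through a union, which works for any runtime k. A minimal standalone sketch of that idiom, with a hypothetical helper name (sk_lane_read is not part of this CL):

    #include <xmmintrin.h>   // SSE: __m128, _mm_setr_ps
    #include <assert.h>

    static inline float sk_lane_read(__m128 vec, int k) {
        assert(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {vec};  // same union-pun as operator[] above
        return pun.fs[k & 3];                          // mask keeps the index in bounds
    }

    // e.g. sk_lane_read(_mm_setr_ps(1,2,3,4), 2) == 3.0f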
| 67 template <> | 66 template <> |
| 68 class SkNx<2, double> { | |
| 69 public: | |
| 70 SkNx(const __m128d& vec) : fVec(vec) {} | |
| 71 | |
| 72 SkNx() {} | |
| 73 SkNx(double val) : fVec(_mm_set1_pd(val)) {} | |
| 74 static SkNx Load(const void* ptr) { return _mm_loadu_pd((const double*)ptr); } | |
| 75 SkNx(double a, double b) : fVec(_mm_setr_pd(a,b)) {} | |
| 76 | |
| 77 void store(void* ptr) const { _mm_storeu_pd((double*)ptr, fVec); } | |
| 78 | |
| 79 SkNx operator + (const SkNx& o) const { return _mm_add_pd(fVec, o.fVec); } | |
| 80 SkNx operator - (const SkNx& o) const { return _mm_sub_pd(fVec, o.fVec); } | |
| 81 SkNx operator * (const SkNx& o) const { return _mm_mul_pd(fVec, o.fVec); } | |
| 82 SkNx operator / (const SkNx& o) const { return _mm_div_pd(fVec, o.fVec); } | |
| 83 | |
| 84 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_pd (fVec, o.fVec); } | |
| 85 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_pd(fVec, o.fVec); } | |
| 86 SkNx operator < (const SkNx& o) const { return _mm_cmplt_pd (fVec, o.fVec); } | |
| 87 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_pd (fVec, o.fVec); } | |
| 88 SkNx operator <= (const SkNx& o) const { return _mm_cmple_pd (fVec, o.fVec); } | |
| 89 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_pd (fVec, o.fVec); } | |
| 90 | |
| 91 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_pd(l.fVec, r.fVec); } | |
| 92 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_pd(l.fVec, r.fVec); } | |
| 93 | |
| 94 SkNx sqrt() const { return _mm_sqrt_pd(fVec); } | |
| 95 | |
| 96 template <int k> double kth() const { | |
| 97 SkASSERT(0 <= k && k < 2); | |
| 98 union { __m128d v; double fs[2]; } pun = {fVec}; | |
| 99 return pun.fs[k&1]; | |
| 100 } | |
| 101 | |
| 102 bool allTrue() const { return 0x3 == _mm_movemask_pd(fVec); } | |
| 103 bool anyTrue() const { return 0x0 != _mm_movemask_pd(fVec); } | |
| 104 | |
| 105 SkNx thenElse(const SkNx& t, const SkNx& e) const { | |
| 106 return _mm_or_pd(_mm_and_pd (fVec, t.fVec), | |
| 107 _mm_andnot_pd(fVec, e.fVec)); | |
| 108 } | |
| 109 | |
| 110 __m128d fVec; | |
| 111 }; | |
| 112 | |
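The removed Sk2d class carried the same thenElse() blend that the float classes keep: a compare result is all ones or all zeros per lane, so (mask & t) | (~mask & e) selects t where the condition held and e elsewhere. A standalone sketch of that select idiom for __m128, with a hypothetical helper name:

    #include <xmmintrin.h>   // SSE: _mm_and_ps, _mm_andnot_ps, _mm_or_ps

    static inline __m128 sk_select(__m128 mask, __m128 t, __m128 e) {
        return _mm_or_ps(_mm_and_ps   (mask, t),   // lanes where mask is all ones
                         _mm_andnot_ps(mask, e));  // lanes where mask is all zeros
    }

    // Usage: sk_select(_mm_cmplt_ps(x, y), x, y) is a per-lane min(x, y).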
| 113 template <> | |
| 114 class SkNx<4, int> { | |
| 115 public: | |
| 116 SkNx(const __m128i& vec) : fVec(vec) {} | |
| 117 | |
| 118 SkNx() {} | |
| 119 SkNx(int val) : fVec(_mm_set1_epi32(val)) {} | |
| 120 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } | |
| 121 SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} | |
| 122 | |
| 123 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } | |
| 124 | |
| 125 SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } | |
| 126 SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } | |
| 127 SkNx operator * (const SkNx& o) const { | |
| 128 __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), | |
| 129 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); | |
| 130 return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), | |
| 131 _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); | |
| 132 } | |
| 133 | |
| 134 SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } | |
| 135 SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } | |
| 136 | |
| 137 template <int k> int kth() const { | |
| 138 SkASSERT(0 <= k && k < 4); | |
| 139 switch (k) { | |
| 140 case 0: return _mm_cvtsi128_si32(fVec); | |
| 141 case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 4)); | |
| 142 case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 8)); | |
| 143 case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12)); | |
| 144 default: SkASSERT(false); return 0; | |
| 145 } | |
| 146 } | |
| 147 | |
| 148 __m128i fVec; | |
| 149 }; | |
| 150 | |
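The removed Sk4i operator* shows how a 32-bit lane multiply has to be pieced together on plain SSE2: _mm_mul_epu32 only multiplies lanes 0 and 2 (to 64-bit results), so the odd lanes are shifted down, both halves are narrowed back to their low 32 bits, and the results are interleaved into order. A standalone sketch of the same trick, assuming SSE2 only (helper name hypothetical; on SSE4.1 this whole dance is just _mm_mullo_epi32):

    #include <emmintrin.h>   // SSE2

    static inline __m128i sk_mullo32_sse2(__m128i a, __m128i b) {
        __m128i mul20 = _mm_mul_epu32(a, b);                               // products of lanes 0 and 2
        __m128i mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4),
                                      _mm_srli_si128(b, 4));               // products of lanes 1 and 3
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
    }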
| 151 template <> | |
| 152 class SkNx<4, float> { | 67 class SkNx<4, float> { |
| 153 public: | 68 public: |
| 154 SkNx(const __m128& vec) : fVec(vec) {} | 69 SkNx(const __m128& vec) : fVec(vec) {} |
| 155 | 70 |
| 156 SkNx() {} | 71 SkNx() {} |
| 157 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} | 72 SkNx(float val) : fVec( _mm_set1_ps(val) ) {} |
| 158 static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); } | 73 static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); } |
| 159 | 74 |
| 160 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} | 75 SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} |
| 161 | 76 |
| 162 void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); } | 77 void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); } |
| 163 | 78 |
| 164 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } | 79 SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } |
| 165 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } | 80 SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } |
| 166 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } | 81 SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } |
| 167 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } | 82 SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } |
| 168 | 83 |
| 169 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } | 84 SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } |
| 170 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } | 85 SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } |
| 171 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } | 86 SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } |
| 172 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } | 87 SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } |
| 173 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); } | 88 SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); } |
| 174 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); } | 89 SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); } |
| 175 | 90 |
| 176 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } | 91 static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } |
| 177 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } | 92 static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } |
| 178 | 93 |
| 179 SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } | 94 SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } |
| 180 | 95 |
| 181 SkNx sqrt() const { return _mm_sqrt_ps (fVec); } | 96 SkNx sqrt () const { return _mm_sqrt_ps (fVec); } |
| 182 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } | 97 SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); } |
| 183 SkNx rsqrt1() const { return this->rsqrt0(); } | 98 SkNx rsqrt1() const { return this->rsqrt0(); } |
| 184 SkNx rsqrt2() const { return this->rsqrt1(); } | 99 SkNx rsqrt2() const { return this->rsqrt1(); } |
| 185 | 100 |
| 186 SkNx invert() const { return SkNx(1) / *this; } | 101 SkNx invert() const { return SkNx(1) / *this; } |
| 187 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } | 102 SkNx approxInvert() const { return _mm_rcp_ps(fVec); } |
| 188 | 103 |
| 189 template <int k> float kth() const { | 104 float operator[](int k) const { |
| 190 SkASSERT(0 <= k && k < 4); | 105 SkASSERT(0 <= k && k < 4); |
| 191 union { __m128 v; float fs[4]; } pun = {fVec}; | 106 union { __m128 v; float fs[4]; } pun = {fVec}; |
| 192 return pun.fs[k&3]; | 107 return pun.fs[k&3]; |
| 193 } | 108 } |
| 109 template <int k> float kth() const { return (*this)[k]; } |
| 194 | 110 |
| 195 bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); } | 111 bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); } |
| 196 bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); } | 112 bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); } |
| 197 | 113 |
| 198 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 114 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 199 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), | 115 return _mm_or_ps(_mm_and_ps (fVec, t.fVec), |
| 200 _mm_andnot_ps(fVec, e.fVec)); | 116 _mm_andnot_ps(fVec, e.fVec)); |
| 201 } | 117 } |
| 202 | 118 |
| 203 __m128 fVec; | 119 __m128 fVec; |
| (...skipping 11 matching lines...) |
| 215 | 131 |
| 216 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } | 132 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } |
| 217 | 133 |
| 218 SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); } | 134 SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); } |
| 219 SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); } | 135 SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); } |
| 220 SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); } | 136 SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); } |
| 221 | 137 |
| 222 SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); } | 138 SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); } |
| 223 SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); } | 139 SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); } |
| 224 | 140 |
| 225 template <int k> uint16_t kth() const { | 141 uint16_t operator[](int k) const { |
| 226 SkASSERT(0 <= k && k < 4); | 142 SkASSERT(0 <= k && k < 4); |
| 227 return _mm_extract_epi16(fVec, k); | 143 union { __m128i v; uint16_t us[8]; } pun = {fVec}; |
| 144 return pun.us[k&3]; |
| 228 } | 145 } |
| 146 template <int k> uint16_t kth() const { return (*this)[k]; } |
| 229 | 147 |
| 230 __m128i fVec; | 148 __m128i fVec; |
| 231 }; | 149 }; |
| 232 | 150 |
| 233 template <> | 151 template <> |
| 234 class SkNx<8, uint16_t> { | 152 class SkNx<8, uint16_t> { |
| 235 public: | 153 public: |
| 236 SkNx(const __m128i& vec) : fVec(vec) {} | 154 SkNx(const __m128i& vec) : fVec(vec) {} |
| 237 | 155 |
| 238 SkNx() {} | 156 SkNx() {} |
| (...skipping 18 matching lines...) |
| 257 const __m128i top_8x = _mm_set1_epi16(top); | 175 const __m128i top_8x = _mm_set1_epi16(top); |
| 258 return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x), | 176 return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x), |
| 259 _mm_sub_epi8(b.fVec, top_8x))); | 177 _mm_sub_epi8(b.fVec, top_8x))); |
| 260 } | 178 } |
| 261 | 179 |
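The Min() just above works around the missing unsigned 16-bit min in SSE2: both inputs are biased by 0x8000 (flipping the top bit maps unsigned order onto signed order), compared with the signed _mm_min_epi16, and the bias is added back. The byte-wise add/sub used in the file is equivalent here because the low byte of 0x8000 is zero, so no carries cross byte lanes. A sketch of the same idea written with the 16-bit forms (helper name hypothetical):

    #include <emmintrin.h>   // SSE2

    static inline __m128i sk_min_epu16_sse2(__m128i a, __m128i b) {
        const __m128i bias = _mm_set1_epi16((short)0x8000);
        __m128i m = _mm_min_epi16(_mm_sub_epi16(a, bias),   // now ordered as signed values
                                  _mm_sub_epi16(b, bias));
        return _mm_add_epi16(m, bias);                      // undo the bias
    }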
| 262 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 180 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 263 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 181 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 264 _mm_andnot_si128(fVec, e.fVec)); | 182 _mm_andnot_si128(fVec, e.fVec)); |
| 265 } | 183 } |
| 266 | 184 |
| 267 template <int k> uint16_t kth() const { | 185 uint16_t operator[](int k) const { |
| 268 SkASSERT(0 <= k && k < 8); | 186 SkASSERT(0 <= k && k < 8); |
| 269 return _mm_extract_epi16(fVec, k); | 187 union { __m128i v; uint16_t us[8]; } pun = {fVec}; |
| 188 return pun.us[k&7]; |
| 270 } | 189 } |
| 190 template <int k> uint16_t kth() const { return (*this)[k]; } |
| 271 | 191 |
| 272 __m128i fVec; | 192 __m128i fVec; |
| 273 }; | 193 }; |
| 274 | 194 |
| 275 template <> | 195 template <> |
| 276 class SkNx<4, uint8_t> { | 196 class SkNx<4, uint8_t> { |
| 277 public: | 197 public: |
| 278 SkNx(const __m128i& vec) : fVec(vec) {} | 198 SkNx(const __m128i& vec) : fVec(vec) {} |
| 279 | 199 |
| 280 SkNx() {} | 200 SkNx() {} |
| 281 static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); } | 201 static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); } |
| 282 void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } | 202 void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } |
| 283 | 203 |
| 284 // TODO as needed | 204 // TODO as needed |
| 285 | 205 |
| 286 __m128i fVec; | 206 __m128i fVec; |
| 287 }; | 207 }; |
| 288 | 208 |
| 289 template <> | 209 template <> |
| 290 class SkNx<8, uint8_t> { | |
| 291 public: | |
| 292 SkNx(const __m128i& vec) : fVec(vec) {} | |
| 293 | |
| 294 SkNx() {} | |
| 295 static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } | |
| 296 void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } | |
| 297 | |
| 298 // TODO as needed | |
| 299 | |
| 300 __m128i fVec; | |
| 301 }; | |
| 302 | |
| 303 template <> | |
| 304 class SkNx<16, uint8_t> { | 210 class SkNx<16, uint8_t> { |
| 305 public: | 211 public: |
| 306 SkNx(const __m128i& vec) : fVec(vec) {} | 212 SkNx(const __m128i& vec) : fVec(vec) {} |
| 307 | 213 |
| 308 SkNx() {} | 214 SkNx() {} |
| 309 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} | 215 SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} |
| 310 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } | 216 static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } |
| 311 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 217 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
| 312 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 218 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
| 313 uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 219 uint8_t i, uint8_t j, uint8_t k, uint8_t l, |
| 314 uint8_t m, uint8_t n, uint8_t o, uint8_t p) | 220 uint8_t m, uint8_t n, uint8_t o, uint8_t p) |
| 315 : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {} | 221 : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {} |
| 316 | 222 |
| 317 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } | 223 void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } |
| 318 | 224 |
| 319 SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); } | 225 SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); } |
| 320 | 226 |
| 321 SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); } | 227 SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); } |
| 322 SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); } | 228 SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); } |
| 323 | 229 |
| 324 static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); } | 230 static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); } |
| 325 SkNx operator < (const SkNx& o) const { | 231 SkNx operator < (const SkNx& o) const { |
| 326 // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare. | 232 // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare. |
| 327 auto flip = _mm_set1_epi8(char(0x80)); | 233 auto flip = _mm_set1_epi8(char(0x80)); |
| 328 return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec)); | 234 return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec)); |
| 329 } | 235 } |
| 330 | 236 |
| 331 template <int k> uint8_t kth() const { | 237 uint8_t operator[](int k) const { |
| 332 SkASSERT(0 <= k && k < 16); | 238 SkASSERT(0 <= k && k < 16); |
| 333 // SSE4.1 would just `return _mm_extract_epi8(fVec, k)`. We have to read 16-bits instead. | 239 union { __m128i v; uint8_t us[16]; } pun = {fVec}; |
| 334 int pair = _mm_extract_epi16(fVec, k/2); | 240 return pun.us[k&15]; |
| 335 return k % 2 == 0 ? pair : (pair >> 8); | |
| 336 } | 241 } |
| 242 template <int k> uint8_t kth() const { return (*this)[k]; } |
| 337 | 243 |
| 338 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 244 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 339 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), | 245 return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), |
| 340 _mm_andnot_si128(fVec, e.fVec)); | 246 _mm_andnot_si128(fVec, e.fVec)); |
| 341 } | 247 } |
| 342 | 248 |
| 343 __m128i fVec; | 249 __m128i fVec; |
| 344 }; | 250 }; |
| 345 | 251 |
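The Sk16b operator< above uses the standard sign-flip trick: SSE2 only has a signed byte compare, and XORing both operands with 0x80 maps unsigned byte order onto signed order, so _mm_cmplt_epi8 then gives the unsigned answer. A standalone sketch (helper name hypothetical):

    #include <emmintrin.h>   // SSE2

    static inline __m128i sk_cmplt_epu8_sse2(__m128i a, __m128i b) {
        const __m128i flip = _mm_set1_epi8((char)0x80);    // flip each byte's top bit
        return _mm_cmplt_epi8(_mm_xor_si128(a, flip),
                              _mm_xor_si128(b, flip));     // all ones where a < b as unsigned
    }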
| 346 | 252 |
| 347 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { | 253 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
| 348 return _mm_cvttps_epi32(src.fVec); | |
| 349 } | |
| 350 | |
| 351 template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) { | |
| 352 auto _32 = _mm_cvttps_epi32(src.fVec); | 254 auto _32 = _mm_cvttps_epi32(src.fVec); |
| 353 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. | 255 // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. |
| 354 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 256 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 355 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. | 257 // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. |
| 356 const int _ = ~0; | 258 const int _ = ~0; |
| 357 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); | 259 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); |
| 358 #else | 260 #else |
| 359 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: | 261 // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: |
| 360 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); | 262 _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); |
| 361 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000)); | 263 return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000)); |
| 362 #endif | 264 #endif |
| 363 } | 265 } |
| 364 | 266 |
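The SSE2 branch of the float-to-uint16 cast above emulates the SSE4.1-only _mm_packus_epi32: the 32-bit values are biased down by 0x8000, packed with the signed-saturating _mm_packs_epi32, and re-biased in 16-bit lanes. A standalone sketch of that fallback, assuming inputs already fit in [0, 65535] as they do for this cast (helper name hypothetical):

    #include <emmintrin.h>   // SSE2

    static inline __m128i sk_packus_epi32_sse2(__m128i lo, __m128i hi) {
        const __m128i bias32 = _mm_set1_epi32(0x00008000);
        const __m128i bias16 = _mm_set1_epi16((short)0x8000);
        __m128i packed = _mm_packs_epi32(_mm_sub_epi32(lo, bias32),    // signed pack with saturation
                                         _mm_sub_epi32(hi, bias32));
        return _mm_add_epi16(packed, bias16);                          // shift back into unsigned range
    }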
| 365 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { | 267 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
| 366 auto _32 = _mm_cvttps_epi32(src.fVec); | 268 auto _32 = _mm_cvttps_epi32(src.fVec); |
| 367 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 269 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 368 const int _ = ~0; | 270 const int _ = ~0; |
| 369 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); | 271 return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); |
| 370 #else | 272 #else |
| 371 auto _16 = _mm_packus_epi16(_32, _32); | 273 auto _16 = _mm_packus_epi16(_32, _32); |
| 372 return _mm_packus_epi16(_16, _16); | 274 return _mm_packus_epi16(_16, _16); |
| 373 #endif | 275 #endif |
| 374 } | 276 } |
| 375 | 277 |
| 376 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { | 278 template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { |
| 377 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 279 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 378 const int _ = ~0; | 280 const int _ = ~0; |
| 379 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,
_, 3,_,_,_)); | 281 auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,
_, 3,_,_,_)); |
| 380 #else | 282 #else |
| 381 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), | 283 auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), |
| 382 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); | 284 _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); |
| 383 #endif | 285 #endif |
| 384 return _mm_cvtepi32_ps(_32); | 286 return _mm_cvtepi32_ps(_32); |
| 385 } | 287 } |
| 386 | 288 |
| 387 template<> inline Sk4f SkNx_cast<float, uint16_t, 4>(const Sk4h& src) { | 289 template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { |
| 388 auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); | 290 auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); |
| 389 return _mm_cvtepi32_ps(_32); | 291 return _mm_cvtepi32_ps(_32); |
| 390 } | 292 } |
| 391 | 293 |
| 392 static inline void Sk4f_ToBytes(uint8_t bytes[16], | 294 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
| 393 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { | 295 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { |
| 394 _mm_storeu_si128((__m128i*)bytes, | 296 _mm_storeu_si128((__m128i*)bytes, |
| 395 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), | 297 _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), |
| 396 _mm_cvttps_epi32(b.fVec)), | 298 _mm_cvttps_epi32(b.fVec)), |
| 397 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), | 299 _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), |
| 398 _mm_cvttps_epi32(d.fVec)))); | 300 _mm_cvttps_epi32(d.fVec)))); |
| 399 } | 301 } |
| 400 | 302 |
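Sk4f_ToBytes above truncates four float vectors to 32-bit ints and packs them twice with unsigned saturation, so a, b, c and d land in bytes[0..3], [4..7], [8..11] and [12..15]. A scalar reference for the intended result, assuming every value is already in [0, 255] as the callers guarantee (function name hypothetical):

    #include <stdint.h>

    static inline void Sk4f_ToBytes_reference(uint8_t bytes[16], const float v[16]) {
        for (int i = 0; i < 16; i++) {
            bytes[i] = (uint8_t)v[i];   // truncation toward zero, matching _mm_cvttps_epi32
        }
    }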
| 401 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) { | 303 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { |
| 402 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); | 304 return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); |
| 403 } | 305 } |
| 404 | 306 |
| 405 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { | 307 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
| 406 return _mm_packus_epi16(src.fVec, src.fVec); | 308 return _mm_packus_epi16(src.fVec, src.fVec); |
| 407 } | 309 } |
| 408 | 310 |
| 409 | |
| 410 } // namespace | |
| 411 | |
| 412 #endif//SkNx_sse_DEFINED | 311 #endif//SkNx_sse_DEFINED |