Index: src/opts/SkNx_sse.h |
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h |
index a4594115e0ecff4e4223606da03c4ccba8fcbcd8..a4783c6302eec19f2231e852ced78ebdb67af588 100644 |
--- a/src/opts/SkNx_sse.h |
+++ b/src/opts/SkNx_sse.h |
@@ -544,6 +544,14 @@ public: |
__m256i fVec; |
}; |
+ // _mm256_unpack{lo,hi}_pd() auto-casting to and from __m256d. |
+ AI static __m256 unpacklo_pd(__m256 x, __m256 y) { |
+ return _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(x), _mm256_castps_pd(y))); |
+ } |
+ AI static __m256 unpackhi_pd(__m256 x, __m256 y) { |
+ return _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(x), _mm256_castps_pd(y))); |
+ } |
+ |
template <> |
class SkNx<8, float> { |
public: |
@@ -560,6 +568,29 @@ public: |
AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const float*)ptr); } |
AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); } |
+ AI static void Store4(void* ptr, |
+ const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { |
+ __m256 rg0145 = _mm256_unpacklo_ps(r.fVec, g.fVec), // r0 g0 r1 g1 | r4 g4 r5 g5 |
+ rg2367 = _mm256_unpackhi_ps(r.fVec, g.fVec), // r2 ... | r6 ... |
+ ba0145 = _mm256_unpacklo_ps(b.fVec, a.fVec), // b0 a0 b1 a1 | b4 a4 b5 a5 |
+ ba2367 = _mm256_unpackhi_ps(b.fVec, a.fVec); // b2 ... | b6 ... |
+ |
+ __m256 _04 = unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 |
+ _15 = unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... |
+ _26 = unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... |
+ _37 = unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... |
+ |
+ __m256 _01 = _mm256_permute2f128_ps(_04, _15, 16), // 16 == 010 000 == lo, lo |
+ _23 = _mm256_permute2f128_ps(_26, _37, 16), |
+ _45 = _mm256_permute2f128_ps(_04, _15, 25), // 25 == 011 001 == hi, hi |
+ _67 = _mm256_permute2f128_ps(_26, _37, 25); |
+ |
+ _mm256_storeu_ps((float*)ptr + 0*8, _01); |
+ _mm256_storeu_ps((float*)ptr + 1*8, _23); |
+ _mm256_storeu_ps((float*)ptr + 2*8, _45); |
+ _mm256_storeu_ps((float*)ptr + 3*8, _67); |
+ } |
+ |
AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fVec); } |
AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fVec); } |
AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fVec); } |