| Index: src/opts/SkNx_sse.h | 
| diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h | 
| index a4594115e0ecff4e4223606da03c4ccba8fcbcd8..a4783c6302eec19f2231e852ced78ebdb67af588 100644 | 
| --- a/src/opts/SkNx_sse.h | 
| +++ b/src/opts/SkNx_sse.h | 
| @@ -544,6 +544,14 @@ public: | 
| __m256i fVec; | 
| }; | 
|  | 
| +    // _mm256_unpack{lo,hi}_pd() auto-casting to and from __m256d, so each adjacent pair of floats moves as one unit. | 
| +    AI static __m256 unpacklo_pd(__m256 x, __m256 y) {  /* -> x0 x1 y0 y1 | x4 x5 y4 y5 (float indices) */ | 
| +        return _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(x), _mm256_castps_pd(y))); | 
| +    } | 
| +    AI static __m256 unpackhi_pd(__m256 x, __m256 y) {  /* -> x2 x3 y2 y3 | x6 x7 y6 y7 (float indices) */ | 
| +        return _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(x), _mm256_castps_pd(y))); | 
| +    } | 
| + | 
| template <> | 
| class SkNx<8, float> { | 
| public: | 
| @@ -560,6 +568,29 @@ public: | 
| AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const float*)ptr); } | 
| AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); } | 
|  | 
| +        AI static void Store4(void* ptr,  // writes 32 floats to ptr: r0 g0 b0 a0  r1 g1 b1 a1  ...  r7 g7 b7 a7 | 
| +                              const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { | 
| +            __m256 rg0145 = _mm256_unpacklo_ps(r.fVec, g.fVec),  // r0 g0 r1 g1 | r4 g4 r5 g5 | 
| +                   rg2367 = _mm256_unpackhi_ps(r.fVec, g.fVec),  // r2 ...      | r6 ... | 
| +                   ba0145 = _mm256_unpacklo_ps(b.fVec, a.fVec),  // b0 a0 b1 a1 | b4 a4 b5 a5 | 
| +                   ba2367 = _mm256_unpackhi_ps(b.fVec, a.fVec);  // b2 ...      | b6 ... | 
| + | 
| +            __m256 _04 = unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4 | 
| +                   _15 = unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ... | 
| +                   _26 = unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ... | 
| +                   _37 = unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ... | 
| + | 
| +            __m256 _01 = _mm256_permute2f128_ps(_04, _15, 0x20),  // 0x20: low half = 1st arg lo (0), high half = 2nd arg lo (2) | 
| +                   _23 = _mm256_permute2f128_ps(_26, _37, 0x20),  // selectors are 2-bit fields at imm8[1:0] and imm8[5:4] | 
| +                   _45 = _mm256_permute2f128_ps(_04, _15, 0x31),  // 0x31: low half = 1st arg hi (1), high half = 2nd arg hi (3) | 
| +                   _67 = _mm256_permute2f128_ps(_26, _37, 0x31);  // bits 3 and 7 must stay clear: they zero a half | 
| + | 
| +            _mm256_storeu_ps((float*)ptr + 0*8, _01);  // pixels 0 and 1 | 
| +            _mm256_storeu_ps((float*)ptr + 1*8, _23);  // pixels 2 and 3 | 
| +            _mm256_storeu_ps((float*)ptr + 2*8, _45);  // pixels 4 and 5 | 
| +            _mm256_storeu_ps((float*)ptr + 3*8, _67);  // pixels 6 and 7 | 
| +        } | 
| + | 
| AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fVec); } | 
| AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fVec); } | 
| AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fVec); } | 
|  |