Index: src/opts/SkNx_sse.h |
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h |
index 3e66637df3d551fdd681179527dc4d1c293aaee7..0c9bb4cf02b81fab16a47a3dc2cf1bd1855a7f8f 100644 |
--- a/src/opts/SkNx_sse.h |
+++ b/src/opts/SkNx_sse.h |
@@ -478,4 +478,35 @@ static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk |
_mm_storeu_si128(((__m128i*) dst) + 1, hi); |
} |
+static inline void Sk4f_load4(const void* ptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) { |
mtklein
2016/09/14 20:30:36
May want to call _MM_TRANSPOSE_PS, or crib off it:
msarett
2016/09/14 21:09:43
Cool! Done.
|
+ __m128 v0 = _mm_loadu_ps(((float*)ptr) + 0), |
+ v1 = _mm_loadu_ps(((float*)ptr) + 4), |
+ v2 = _mm_loadu_ps(((float*)ptr) + 8), |
+ v3 = _mm_loadu_ps(((float*)ptr) + 12); |
+ __m128 rg01 = _mm_unpacklo_ps(v0, v1), // r0 r1 g0 g1 |
+ ba01 = _mm_unpackhi_ps(v0, v1), // b0 b1 a0 a1 |
+ rg23 = _mm_unpacklo_ps(v2, v3), // r2 r3 g2 g3 |
+ ba23 = _mm_unpacklo_ps(v2, v3); // b2 b3 a2 a3 |
+ *r = _mm_shuffle_ps(rg01, rg23, 0x88); // 00 01 00 01 |
+ *g = _mm_shuffle_ps(rg01, rg23, 0xEE); // 10 11 10 11 |
+ *b = _mm_shuffle_ps(ba01, ba23, 0x88); // 00 01 00 01 |
+ *a = _mm_shuffle_ps(ba01, ba23, 0xEE); // 10 11 10 11 |
+} |
+ |
+static inline void Sk4f_store4(void* dst, const Sk4f& r, const Sk4f& g, const Sk4f& b, |
+ const Sk4f& a) { |
+ __m128 rg01 = _mm_unpacklo_ps(r.fVec, g.fVec); |
+ __m128 rg23 = _mm_unpackhi_ps(r.fVec, g.fVec); |
+ __m128 ba01 = _mm_unpacklo_ps(b.fVec, a.fVec); |
+ __m128 ba23 = _mm_unpacklo_ps(b.fVec, a.fVec); |
+ __m128 v0 = _mm_shuffle_ps(rg01, ba01, 0x88); |
+ __m128 v1 = _mm_shuffle_ps(rg01, ba01, 0xEE); |
+ __m128 v2 = _mm_shuffle_ps(rg23, ba23, 0x88); |
+ __m128 v3 = _mm_shuffle_ps(rg23, ba23, 0xEE); |
+ _mm_storeu_ps(((float*) dst) + 0, v0); |
+ _mm_storeu_ps(((float*) dst) + 4, v1); |
+ _mm_storeu_ps(((float*) dst) + 8, v2); |
+ _mm_storeu_ps(((float*) dst) + 12, v3); |
+} |
+ |
#endif//SkNx_sse_DEFINED |