src/core/Sk4x4f.h - Issue 1817353005: Sk4x4f: Simplify x86 down to SSE2.

Side by Side Diff: src/core/Sk4x4f.h

Issue 1817353005: Sk4x4f: Simplify x86 down to SSE2. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: derp Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef Sk4x4f_DEFINED	8 #ifndef Sk4x4f_DEFINED

9 #define Sk4x4f_DEFINED	9 #define Sk4x4f_DEFINED

10	10

11 #include "SkNx.h"	11 #include "SkNx.h"

12	12

13 struct Sk4x4f {	13 struct Sk4x4f {

14 Sk4f r,g,b,a;	14 Sk4f r,g,b,a;

15	15

16 static Sk4x4f Transpose(const Sk4f&, const Sk4f&, const Sk4f&, const Sk4f&);	16 static Sk4x4f Transpose(const Sk4f&, const Sk4f&, const Sk4f&, const Sk4f&);

17 static Sk4x4f Transpose(const float[16]);	17 static Sk4x4f Transpose(const float[16]);

18 static Sk4x4f Transpose(const uint8_t[16]);	18 static Sk4x4f Transpose(const uint8_t[16]);

19	19

20 void transpose(Sk4f*, Sk4f*, Sk4f*, Sk4f*) const;	20 void transpose(Sk4f*, Sk4f*, Sk4f*, Sk4f*) const;

21 void transpose( float[16]) const;	21 void transpose( float[16]) const;

22 void transpose(uint8_t[16]) const;	22 void transpose(uint8_t[16]) const;

23 };	23 };

24	24

25 // TODO: SSE2, NEON	25 // TODO: NEON

26	26

27 #if 1 && !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	27 #if 1 && !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

28	28

29 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {	29 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {

30 auto r = x.fVec,	30 auto r = x.fVec,

31 g = y.fVec,	31 g = y.fVec,

32 b = z.fVec,	32 b = z.fVec,

33 a = w.fVec;	33 a = w.fVec;

34 _MM_TRANSPOSE4_PS(r,g,b,a);	34 _MM_TRANSPOSE4_PS(r,g,b,a);

35 return { r,g,b,a };	35 return { r,g,b,a };

36 }	36 }

37	37

38 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) {	38 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) {

39 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12));	39 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12));

40 }	40 }

41	41

42 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) {	42 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) {

43 auto b16 = _mm_loadu_si128((const __m128i*)bs);	43 auto b16 = _mm_loadu_si128((const __m128i*)bs);

44 auto _ = ~0; // Shuffles in a zero byte.	44

45 auto r = _mm_cvtepi32_ps(	45 auto mask = _mm_set1_epi32(0xFF);

46 _mm_shuffle_epi8(b16, _mm_setr_epi8(0,_,_,_,4,_,_,_, 8,_,_,_,12,_,_, _)));	46 auto r = _mm_cvtepi32_ps(_mm_and_si128(mask, (b16 ))),

47 auto g = _mm_cvtepi32_ps(	47 g = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 8))),

48 _mm_shuffle_epi8(b16, _mm_setr_epi8(1,_,_,_,5,_,_,_, 9,_,_,_,13,_,_, _)));	48 b = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 16))),

49 auto b = _mm_cvtepi32_ps(	49 a = _mm_cvtepi32_ps( _mm_srli_epi32(b16, 24));

50 _mm_shuffle_epi8(b16, _mm_setr_epi8(2,_,_,_,6,_,_,_,10,_,_,_,14,_,_, _)));

51 auto a = _mm_cvtepi32_ps(

52 _mm_shuffle_epi8(b16, _mm_setr_epi8(3,_,_,_,7,_,_,_,11,_,_,_,15,_,_, _)));

53 return { r,g,b,a };	50 return { r,g,b,a };

54 }	51 }

55	52

56 inline void Sk4x4f::transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const {	53 inline void Sk4x4f::transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const {

57 auto R = r.fVec,	54 auto R = r.fVec,

58 G = g.fVec,	55 G = g.fVec,

59 B = b.fVec,	56 B = b.fVec,

60 A = a.fVec;	57 A = a.fVec;

61 _MM_TRANSPOSE4_PS(R,G,B,A);	58 _MM_TRANSPOSE4_PS(R,G,B,A);

62 *x = R;	59 *x = R;

63 *y = G;	60 *y = G;

64 *z = B;	61 *z = B;

65 *w = A;	62 *w = A;

66 }	63 }

67	64

68 inline void Sk4x4f::transpose(float fs[16]) const {	65 inline void Sk4x4f::transpose(float fs[16]) const {

69 Sk4f x,y,z,w;	66 Sk4f x,y,z,w;

70 this->transpose(&x,&y,&z,&w);	67 this->transpose(&x,&y,&z,&w);

71 x.store(fs+ 0);	68 x.store(fs+ 0);

72 y.store(fs+ 4);	69 y.store(fs+ 4);

73 z.store(fs+ 8);	70 z.store(fs+ 8);

74 w.store(fs+12);	71 w.store(fs+12);

75 }	72 }

76	73

77 inline void Sk4x4f::transpose(uint8_t bs[16]) const {	74 inline void Sk4x4f::transpose(uint8_t bs[16]) const {

78 auto packed = _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(r.fVec),	75 auto R = _mm_cvttps_epi32(r.fVec),

79 _mm_cvttps_epi32(g.fVec)),	76 G = _mm_slli_epi32(_mm_cvttps_epi32(g.fVec), 8),

80 _mm_packus_epi16(_mm_cvttps_epi32(b.fVec),	77 B = _mm_slli_epi32(_mm_cvttps_epi32(b.fVec), 16),

81 _mm_cvttps_epi32(a.fVec)));	78 A = _mm_slli_epi32(_mm_cvttps_epi32(a.fVec), 24);

82 _mm_storeu_si128((__m128i*)bs, _mm_shuffle_epi8(packed, _mm_setr_epi8(0, 4, 8, 12,	79 _mm_storeu_si128((__m128i*)bs, _mm_or_si128(A, _mm_or_si128(B, _mm_or_si128(G, R))));

83 1, 5, 9, 13,

84 2, 6, 10, 14,

85 3, 7, 11, 15)));

86 }	80 }

87	81

88 #else	82 #else

89	83

90 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {	84 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {

91 return {	85 return {

92 { x[0], y[0], z[0], w[0] },	86 { x[0], y[0], z[0], w[0] },

93 { x[1], y[1], z[1], w[1] },	87 { x[1], y[1], z[1], w[1] },

94 { x[2], y[2], z[2], w[2] },	88 { x[2], y[2], z[2], w[2] },

95 { x[3], y[3], z[3], w[3] },	89 { x[3], y[3], z[3], w[3] },

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
128 inline void Sk4x4f::transpose(uint8_t bs[16]) const {	122 inline void Sk4x4f::transpose(uint8_t bs[16]) const {

129 bs[ 0] = (uint8_t)r[0]; bs[ 1] = (uint8_t)g[0]; bs[ 2] = (uint8_t)b[0]; bs[ 3] = (uint8_t)a[0];	123 bs[ 0] = (uint8_t)r[0]; bs[ 1] = (uint8_t)g[0]; bs[ 2] = (uint8_t)b[0]; bs[ 3] = (uint8_t)a[0];

130 bs[ 4] = (uint8_t)r[1]; bs[ 5] = (uint8_t)g[1]; bs[ 6] = (uint8_t)b[1]; bs[ 7] = (uint8_t)a[1];	124 bs[ 4] = (uint8_t)r[1]; bs[ 5] = (uint8_t)g[1]; bs[ 6] = (uint8_t)b[1]; bs[ 7] = (uint8_t)a[1];

131 bs[ 8] = (uint8_t)r[2]; bs[ 9] = (uint8_t)g[2]; bs[10] = (uint8_t)b[2]; bs[11] = (uint8_t)a[2];	125 bs[ 8] = (uint8_t)r[2]; bs[ 9] = (uint8_t)g[2]; bs[10] = (uint8_t)b[2]; bs[11] = (uint8_t)a[2];

132 bs[12] = (uint8_t)r[3]; bs[13] = (uint8_t)g[3]; bs[14] = (uint8_t)b[3]; bs[15] = (uint8_t)a[3];	126 bs[12] = (uint8_t)r[3]; bs[13] = (uint8_t)g[3]; bs[14] = (uint8_t)b[3]; bs[15] = (uint8_t)a[3];

133 }	127 }

134	128

135 #endif	129 #endif

136	130

137 #endif//Sk4x4f_DEFINED	131 #endif//Sk4x4f_DEFINED

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »