OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef Sk4x4f_DEFINED | 8 #ifndef Sk4x4f_DEFINED |
9 #define Sk4x4f_DEFINED | 9 #define Sk4x4f_DEFINED |
10 | 10 |
11 #include "SkNx.h" | 11 #include "SkNx.h" |
12 | 12 |
13 struct Sk4x4f { | 13 struct Sk4x4f { |
14 Sk4f r,g,b,a; | 14 Sk4f r,g,b,a; |
15 | 15 |
16 static Sk4x4f Transpose(const Sk4f&, const Sk4f&, const Sk4f&, const Sk4f&); | 16 static Sk4x4f Transpose(const Sk4f&, const Sk4f&, const Sk4f&, const Sk4f&); |
17 static Sk4x4f Transpose(const float[16]); | 17 static Sk4x4f Transpose(const float[16]); |
18 static Sk4x4f Transpose(const uint8_t[16]); | 18 static Sk4x4f Transpose(const uint8_t[16]); |
19 | 19 |
20 void transpose(Sk4f*, Sk4f*, Sk4f*, Sk4f*) const; | 20 void transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const { |
21 auto t = Transpose(r,g,b,a); | |
22 *x = t.r; | |
23 *y = t.g; | |
24 *z = t.b; | |
25 *w = t.a; | |
26 } | |
21 void transpose( float[16]) const; | 27 void transpose( float[16]) const; |
22 void transpose(uint8_t[16]) const; | 28 void transpose(uint8_t[16]) const; |
23 }; | 29 }; |
24 | 30 |
25 // TODO: NEON | |
26 | |
27 #if 1 && !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 31 #if 1 && !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
28 | 32 |
29 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { | 33 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { |
30 auto r = x.fVec, | 34 auto r = x.fVec, |
31 g = y.fVec, | 35 g = y.fVec, |
32 b = z.fVec, | 36 b = z.fVec, |
33 a = w.fVec; | 37 a = w.fVec; |
34 _MM_TRANSPOSE4_PS(r,g,b,a); | 38 _MM_TRANSPOSE4_PS(r,g,b,a); |
35 return { r,g,b,a }; | 39 return { r,g,b,a }; |
36 } | 40 } |
37 | 41 |
38 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { | 42 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { |
39 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12)); | 43 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12)); |
40 } | 44 } |
41 | 45 |
42 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { | 46 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { |
43 auto b16 = _mm_loadu_si128((const __m128i*)bs); | 47 auto b16 = _mm_loadu_si128((const __m128i*)bs); |
44 | 48 |
45 auto mask = _mm_set1_epi32(0xFF); | 49 auto mask = _mm_set1_epi32(0xFF); |
46 auto r = _mm_cvtepi32_ps(_mm_and_si128(mask, (b16 ))), | 50 auto r = _mm_cvtepi32_ps(_mm_and_si128(mask, (b16 ))), |
47 g = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 8))), | 51 g = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 8))), |
48 b = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 16))), | 52 b = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 16))), |
49 a = _mm_cvtepi32_ps( _mm_srli_epi32(b16, 24)); | 53 a = _mm_cvtepi32_ps( _mm_srli_epi32(b16, 24)); |
50 return { r,g,b,a }; | 54 return { r,g,b,a }; |
51 } | 55 } |
52 | 56 |
53 inline void Sk4x4f::transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const { | |
54 auto R = r.fVec, | |
55 G = g.fVec, | |
56 B = b.fVec, | |
57 A = a.fVec; | |
58 _MM_TRANSPOSE4_PS(R,G,B,A); | |
59 *x = R; | |
60 *y = G; | |
61 *z = B; | |
62 *w = A; | |
63 } | |
64 | |
65 inline void Sk4x4f::transpose(float fs[16]) const { | 57 inline void Sk4x4f::transpose(float fs[16]) const { |
66 Sk4f x,y,z,w; | 58 Sk4f x,y,z,w; |
67 this->transpose(&x,&y,&z,&w); | 59 this->transpose(&x,&y,&z,&w); |
68 x.store(fs+ 0); | 60 x.store(fs+ 0); |
69 y.store(fs+ 4); | 61 y.store(fs+ 4); |
70 z.store(fs+ 8); | 62 z.store(fs+ 8); |
71 w.store(fs+12); | 63 w.store(fs+12); |
72 } | 64 } |
73 | 65 |
74 inline void Sk4x4f::transpose(uint8_t bs[16]) const { | 66 inline void Sk4x4f::transpose(uint8_t bs[16]) const { |
75 auto R = _mm_cvttps_epi32(r.fVec), | 67 auto R = _mm_cvttps_epi32(r.fVec), |
76 G = _mm_slli_epi32(_mm_cvttps_epi32(g.fVec), 8), | 68 G = _mm_slli_epi32(_mm_cvttps_epi32(g.fVec), 8), |
77 B = _mm_slli_epi32(_mm_cvttps_epi32(b.fVec), 16), | 69 B = _mm_slli_epi32(_mm_cvttps_epi32(b.fVec), 16), |
78 A = _mm_slli_epi32(_mm_cvttps_epi32(a.fVec), 24); | 70 A = _mm_slli_epi32(_mm_cvttps_epi32(a.fVec), 24); |
79 _mm_storeu_si128((__m128i*)bs, _mm_or_si128(A, _mm_or_si128(B, _mm_or_si128(G, R)))); | 71 _mm_storeu_si128((__m128i*)bs, _mm_or_si128(A, _mm_or_si128(B, _mm_or_si128(G, R)))); |
80 } | 72 } |
81 | 73 |
74 #elif defined(SK_ARM_HAS_NEON) | |
75 | |
76 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { | |
77 float32x4x2_t xy = vuzpq_f32(x.fVec, y.fVec), | |
78 zw = vuzpq_f32(z.fVec, w.fVec), | |
79 rb = vuzpq_f32(xy.val[0], zw.val[0]), | |
80 ga = vuzpq_f32(xy.val[1], zw.val[1]); | |
81 return { rb.val[0], ga.val[0], rb.val[1], ga.val[1] }; | |
82 } | |
83 | |
84 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { | |
85 float32x4x4_t v = vld4q_f32(fs); | |
86 return { v.val[0], v.val[1], v.val[2], v.val[3] }; | |
87 } | |
88 | |
89 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { | |
90 auto b16 = vreinterpretq_u32_u8(vld1q_u8(bs)); | |
91 auto r = vcvtq_f32_u32(vandq_u32(vdupq_n_u32(0x000000FF), b16) ), | |
msarett
2016/03/24 17:00:50
Woohoo this is cool!
| |
92 g = vcvtq_n_f32_u32(vandq_u32(vdupq_n_u32(0x0000FF00), b16), 8), | |
93 b = vcvtq_n_f32_u32(vandq_u32(vdupq_n_u32(0x00FF0000), b16), 16), | |
94 a = vcvtq_n_f32_u32(vandq_u32(vdupq_n_u32(0xFF000000), b16), 24); | |
95 return { r,g,b,a }; | |
96 } | |
97 | |
98 inline void Sk4x4f::transpose(float fs[16]) const { | |
99 float32x4x4_t v = {{ r.fVec, g.fVec, b.fVec, a.fVec }}; | |
100 vst4q_f32(fs, v); | |
101 } | |
102 | |
103 inline void Sk4x4f::transpose(uint8_t bs[16]) const { | |
104 auto R = vandq_u32(vdupq_n_u32(0x000000FF), vcvtq_u32_f32(r.fVec )), | |
105 G = vandq_u32(vdupq_n_u32(0x0000FF00), vcvtq_n_u32_f32(g.fVec, 8)), | |
106 B = vandq_u32(vdupq_n_u32(0x00FF0000), vcvtq_n_u32_f32(b.fVec, 16)), | |
107 A = vandq_u32(vdupq_n_u32(0xFF000000), vcvtq_n_u32_f32(a.fVec, 24)); | |
108 vst1q_u8(bs, vreinterpretq_u8_u32(vorrq_u32(A, vorrq_u32(B, vorrq_u32(G, R))))); | |
109 } | |
110 | |
82 #else | 111 #else |
83 | 112 |
84 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { | 113 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { |
85 return { | 114 return { |
86 { x[0], y[0], z[0], w[0] }, | 115 { x[0], y[0], z[0], w[0] }, |
87 { x[1], y[1], z[1], w[1] }, | 116 { x[1], y[1], z[1], w[1] }, |
88 { x[2], y[2], z[2], w[2] }, | 117 { x[2], y[2], z[2], w[2] }, |
89 { x[3], y[3], z[3], w[3] }, | 118 { x[3], y[3], z[3], w[3] }, |
90 }; | 119 }; |
91 } | 120 } |
92 | 121 |
93 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { | 122 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { |
94 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12)); | 123 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12)); |
95 } | 124 } |
96 | 125 |
97 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { | 126 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { |
98 return { | 127 return { |
99 { (float)bs[0], (float)bs[4], (float)bs[ 8], (float)bs[12] }, | 128 { (float)bs[0], (float)bs[4], (float)bs[ 8], (float)bs[12] }, |
100 { (float)bs[1], (float)bs[5], (float)bs[ 9], (float)bs[13] }, | 129 { (float)bs[1], (float)bs[5], (float)bs[ 9], (float)bs[13] }, |
101 { (float)bs[2], (float)bs[6], (float)bs[10], (float)bs[14] }, | 130 { (float)bs[2], (float)bs[6], (float)bs[10], (float)bs[14] }, |
102 { (float)bs[3], (float)bs[7], (float)bs[11], (float)bs[15] }, | 131 { (float)bs[3], (float)bs[7], (float)bs[11], (float)bs[15] }, |
103 }; | 132 }; |
104 } | 133 } |
105 | 134 |
106 inline void Sk4x4f::transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const { | |
107 *x = { r[0], g[0], b[0], a[0] }; | |
108 *y = { r[1], g[1], b[1], a[1] }; | |
109 *z = { r[2], g[2], b[2], a[2] }; | |
110 *w = { r[3], g[3], b[3], a[3] }; | |
111 } | |
112 | |
113 inline void Sk4x4f::transpose(float fs[16]) const { | 135 inline void Sk4x4f::transpose(float fs[16]) const { |
114 Sk4f x,y,z,w; | 136 Sk4f x,y,z,w; |
115 this->transpose(&x,&y,&z,&w); | 137 this->transpose(&x,&y,&z,&w); |
116 x.store(fs+ 0); | 138 x.store(fs+ 0); |
117 y.store(fs+ 4); | 139 y.store(fs+ 4); |
118 z.store(fs+ 8); | 140 z.store(fs+ 8); |
119 w.store(fs+12); | 141 w.store(fs+12); |
120 } | 142 } |
121 | 143 |
122 inline void Sk4x4f::transpose(uint8_t bs[16]) const { | 144 inline void Sk4x4f::transpose(uint8_t bs[16]) const { |
123 bs[ 0] = (uint8_t)r[0]; bs[ 1] = (uint8_t)g[0]; bs[ 2] = (uint8_t)b[0]; bs[ 3] = (uint8_t)a[0]; | 145 bs[ 0] = (uint8_t)r[0]; bs[ 1] = (uint8_t)g[0]; bs[ 2] = (uint8_t)b[0]; bs[ 3] = (uint8_t)a[0]; |
124 bs[ 4] = (uint8_t)r[1]; bs[ 5] = (uint8_t)g[1]; bs[ 6] = (uint8_t)b[1]; bs[ 7] = (uint8_t)a[1]; | 146 bs[ 4] = (uint8_t)r[1]; bs[ 5] = (uint8_t)g[1]; bs[ 6] = (uint8_t)b[1]; bs[ 7] = (uint8_t)a[1]; |
125 bs[ 8] = (uint8_t)r[2]; bs[ 9] = (uint8_t)g[2]; bs[10] = (uint8_t)b[2]; bs[11] = (uint8_t)a[2]; | 147 bs[ 8] = (uint8_t)r[2]; bs[ 9] = (uint8_t)g[2]; bs[10] = (uint8_t)b[2]; bs[11] = (uint8_t)a[2]; |
126 bs[12] = (uint8_t)r[3]; bs[13] = (uint8_t)g[3]; bs[14] = (uint8_t)b[3]; bs[15] = (uint8_t)a[3]; | 148 bs[12] = (uint8_t)r[3]; bs[13] = (uint8_t)g[3]; bs[14] = (uint8_t)b[3]; bs[15] = (uint8_t)a[3]; |
127 } | 149 } |
128 | 150 |
129 #endif | 151 #endif |
130 | 152 |
131 #endif//Sk4x4f_DEFINED | 153 #endif//Sk4x4f_DEFINED |
OLD | NEW |