Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(281)

Side by Side Diff: src/core/Sk4x4f.h

Issue 1828613002: Sk4x4f: NEON impl. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: rebase Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef Sk4x4f_DEFINED 8 #ifndef Sk4x4f_DEFINED
9 #define Sk4x4f_DEFINED 9 #define Sk4x4f_DEFINED
10 10
11 #include "SkNx.h" 11 #include "SkNx.h"
12 12
13 struct Sk4x4f { 13 struct Sk4x4f {
14 Sk4f r,g,b,a; 14 Sk4f r,g,b,a;
15 15
16 static Sk4x4f Transpose(const Sk4f&, const Sk4f&, const Sk4f&, const Sk4f&); 16 static Sk4x4f Transpose(const Sk4f&, const Sk4f&, const Sk4f&, const Sk4f&);
17 static Sk4x4f Transpose(const float[16]); 17 static Sk4x4f Transpose(const float[16]);
18 static Sk4x4f Transpose(const uint8_t[16]); 18 static Sk4x4f Transpose(const uint8_t[16]);
19 19
20 void transpose(Sk4f*, Sk4f*, Sk4f*, Sk4f*) const; 20 void transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const {
21 auto t = Transpose(r,g,b,a);
22 *x = t.r;
23 *y = t.g;
24 *z = t.b;
25 *w = t.a;
26 }
21 void transpose( float[16]) const; 27 void transpose( float[16]) const;
22 void transpose(uint8_t[16]) const; 28 void transpose(uint8_t[16]) const;
23 }; 29 };
24 30
25 // TODO: NEON
26
27 #if 1 && !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 31 #if 1 && !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
28 32
29 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { 33 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {
30 auto r = x.fVec, 34 auto r = x.fVec,
31 g = y.fVec, 35 g = y.fVec,
32 b = z.fVec, 36 b = z.fVec,
33 a = w.fVec; 37 a = w.fVec;
34 _MM_TRANSPOSE4_PS(r,g,b,a); 38 _MM_TRANSPOSE4_PS(r,g,b,a);
35 return { r,g,b,a }; 39 return { r,g,b,a };
36 } 40 }
37 41
38 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { 42 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) {
39 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12)); 43 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12));
40 } 44 }
41 45
42 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { 46 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) {
43 auto b16 = _mm_loadu_si128((const __m128i*)bs); 47 auto b16 = _mm_loadu_si128((const __m128i*)bs);
44 48
45 auto mask = _mm_set1_epi32(0xFF); 49 auto mask = _mm_set1_epi32(0xFF);
46 auto r = _mm_cvtepi32_ps(_mm_and_si128(mask, (b16 ))), 50 auto r = _mm_cvtepi32_ps(_mm_and_si128(mask, (b16 ))),
47 g = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 8))), 51 g = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 8))),
48 b = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 16))), 52 b = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(b16, 16))),
49 a = _mm_cvtepi32_ps( _mm_srli_epi32(b16, 24)); 53 a = _mm_cvtepi32_ps( _mm_srli_epi32(b16, 24));
50 return { r,g,b,a }; 54 return { r,g,b,a };
51 } 55 }
52 56
53 inline void Sk4x4f::transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const {
54 auto R = r.fVec,
55 G = g.fVec,
56 B = b.fVec,
57 A = a.fVec;
58 _MM_TRANSPOSE4_PS(R,G,B,A);
59 *x = R;
60 *y = G;
61 *z = B;
62 *w = A;
63 }
64
65 inline void Sk4x4f::transpose(float fs[16]) const { 57 inline void Sk4x4f::transpose(float fs[16]) const {
66 Sk4f x,y,z,w; 58 Sk4f x,y,z,w;
67 this->transpose(&x,&y,&z,&w); 59 this->transpose(&x,&y,&z,&w);
68 x.store(fs+ 0); 60 x.store(fs+ 0);
69 y.store(fs+ 4); 61 y.store(fs+ 4);
70 z.store(fs+ 8); 62 z.store(fs+ 8);
71 w.store(fs+12); 63 w.store(fs+12);
72 } 64 }
73 65
74 inline void Sk4x4f::transpose(uint8_t bs[16]) const { 66 inline void Sk4x4f::transpose(uint8_t bs[16]) const {
75 auto R = _mm_cvttps_epi32(r.fVec), 67 auto R = _mm_cvttps_epi32(r.fVec),
76 G = _mm_slli_epi32(_mm_cvttps_epi32(g.fVec), 8), 68 G = _mm_slli_epi32(_mm_cvttps_epi32(g.fVec), 8),
77 B = _mm_slli_epi32(_mm_cvttps_epi32(b.fVec), 16), 69 B = _mm_slli_epi32(_mm_cvttps_epi32(b.fVec), 16),
78 A = _mm_slli_epi32(_mm_cvttps_epi32(a.fVec), 24); 70 A = _mm_slli_epi32(_mm_cvttps_epi32(a.fVec), 24);
79 _mm_storeu_si128((__m128i*)bs, _mm_or_si128(A, _mm_or_si128(B, _mm_or_si128(G, R)))); 71 _mm_storeu_si128((__m128i*)bs, _mm_or_si128(A, _mm_or_si128(B, _mm_or_si128(G, R))));
80 } 72 }
81 73
74 #elif defined(SK_ARM_HAS_NEON)
75
76 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {
77 float32x4x2_t xy = vuzpq_f32(x.fVec, y.fVec),
78 zw = vuzpq_f32(z.fVec, w.fVec),
79 rb = vuzpq_f32(xy.val[0], zw.val[0]),
80 ga = vuzpq_f32(xy.val[1], zw.val[1]);
81 return { rb.val[0], ga.val[0], rb.val[1], ga.val[1] };
82 }
83
84 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) {
85 float32x4x4_t v = vld4q_f32(fs);
86 return { v.val[0], v.val[1], v.val[2], v.val[3] };
87 }
88
89 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) {
90 auto b16 = vreinterpretq_u32_u8(vld1q_u8(bs));
91 auto r = vcvtq_f32_u32(vandq_u32(vdupq_n_u32(0x000000FF), b16) ),
msarett 2016/03/24 17:00:50 Woohoo this is cool!
92 g = vcvtq_n_f32_u32(vandq_u32(vdupq_n_u32(0x0000FF00), b16), 8),
93 b = vcvtq_n_f32_u32(vandq_u32(vdupq_n_u32(0x00FF0000), b16), 16),
94 a = vcvtq_n_f32_u32(vandq_u32(vdupq_n_u32(0xFF000000), b16), 24);
95 return { r,g,b,a };
96 }
97
98 inline void Sk4x4f::transpose(float fs[16]) const {
99 float32x4x4_t v = {{ r.fVec, g.fVec, b.fVec, a.fVec }};
100 vst4q_f32(fs, v);
101 }
102
103 inline void Sk4x4f::transpose(uint8_t bs[16]) const {
104 auto R = vandq_u32(vdupq_n_u32(0x000000FF), vcvtq_u32_f32(r.fVec )),
105 G = vandq_u32(vdupq_n_u32(0x0000FF00), vcvtq_n_u32_f32(g.fVec, 8)),
106 B = vandq_u32(vdupq_n_u32(0x00FF0000), vcvtq_n_u32_f32(b.fVec, 16)),
107 A = vandq_u32(vdupq_n_u32(0xFF000000), vcvtq_n_u32_f32(a.fVec, 24));
108 vst1q_u8(bs, vreinterpretq_u8_u32(vorrq_u32(A, vorrq_u32(B, vorrq_u32(G, R)))));
109 }
110
82 #else 111 #else
83 112
84 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) { 113 inline Sk4x4f Sk4x4f::Transpose(const Sk4f& x, const Sk4f& y, const Sk4f& z, const Sk4f& w) {
85 return { 114 return {
86 { x[0], y[0], z[0], w[0] }, 115 { x[0], y[0], z[0], w[0] },
87 { x[1], y[1], z[1], w[1] }, 116 { x[1], y[1], z[1], w[1] },
88 { x[2], y[2], z[2], w[2] }, 117 { x[2], y[2], z[2], w[2] },
89 { x[3], y[3], z[3], w[3] }, 118 { x[3], y[3], z[3], w[3] },
90 }; 119 };
91 } 120 }
92 121
93 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) { 122 inline Sk4x4f Sk4x4f::Transpose(const float fs[16]) {
94 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12)); 123 return Transpose(Sk4f::Load(fs+0), Sk4f::Load(fs+4), Sk4f::Load(fs+8), Sk4f::Load(fs+12));
95 } 124 }
96 125
97 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) { 126 inline Sk4x4f Sk4x4f::Transpose(const uint8_t bs[16]) {
98 return { 127 return {
99 { (float)bs[0], (float)bs[4], (float)bs[ 8], (float)bs[12] }, 128 { (float)bs[0], (float)bs[4], (float)bs[ 8], (float)bs[12] },
100 { (float)bs[1], (float)bs[5], (float)bs[ 9], (float)bs[13] }, 129 { (float)bs[1], (float)bs[5], (float)bs[ 9], (float)bs[13] },
101 { (float)bs[2], (float)bs[6], (float)bs[10], (float)bs[14] }, 130 { (float)bs[2], (float)bs[6], (float)bs[10], (float)bs[14] },
102 { (float)bs[3], (float)bs[7], (float)bs[11], (float)bs[15] }, 131 { (float)bs[3], (float)bs[7], (float)bs[11], (float)bs[15] },
103 }; 132 };
104 } 133 }
105 134
106 inline void Sk4x4f::transpose(Sk4f* x, Sk4f* y, Sk4f* z, Sk4f* w) const {
107 *x = { r[0], g[0], b[0], a[0] };
108 *y = { r[1], g[1], b[1], a[1] };
109 *z = { r[2], g[2], b[2], a[2] };
110 *w = { r[3], g[3], b[3], a[3] };
111 }
112
113 inline void Sk4x4f::transpose(float fs[16]) const { 135 inline void Sk4x4f::transpose(float fs[16]) const {
114 Sk4f x,y,z,w; 136 Sk4f x,y,z,w;
115 this->transpose(&x,&y,&z,&w); 137 this->transpose(&x,&y,&z,&w);
116 x.store(fs+ 0); 138 x.store(fs+ 0);
117 y.store(fs+ 4); 139 y.store(fs+ 4);
118 z.store(fs+ 8); 140 z.store(fs+ 8);
119 w.store(fs+12); 141 w.store(fs+12);
120 } 142 }
121 143
122 inline void Sk4x4f::transpose(uint8_t bs[16]) const { 144 inline void Sk4x4f::transpose(uint8_t bs[16]) const {
123 bs[ 0] = (uint8_t)r[0]; bs[ 1] = (uint8_t)g[0]; bs[ 2] = (uint8_t)b[0]; bs[ 3] = (uint8_t)a[0]; 145 bs[ 0] = (uint8_t)r[0]; bs[ 1] = (uint8_t)g[0]; bs[ 2] = (uint8_t)b[0]; bs[ 3] = (uint8_t)a[0];
124 bs[ 4] = (uint8_t)r[1]; bs[ 5] = (uint8_t)g[1]; bs[ 6] = (uint8_t)b[1]; bs[ 7] = (uint8_t)a[1]; 146 bs[ 4] = (uint8_t)r[1]; bs[ 5] = (uint8_t)g[1]; bs[ 6] = (uint8_t)b[1]; bs[ 7] = (uint8_t)a[1];
125 bs[ 8] = (uint8_t)r[2]; bs[ 9] = (uint8_t)g[2]; bs[10] = (uint8_t)b[2]; bs[11] = (uint8_t)a[2]; 147 bs[ 8] = (uint8_t)r[2]; bs[ 9] = (uint8_t)g[2]; bs[10] = (uint8_t)b[2]; bs[11] = (uint8_t)a[2];
126 bs[12] = (uint8_t)r[3]; bs[13] = (uint8_t)g[3]; bs[14] = (uint8_t)b[3]; bs[15] = (uint8_t)a[3]; 148 bs[12] = (uint8_t)r[3]; bs[13] = (uint8_t)g[3]; bs[14] = (uint8_t)b[3]; bs[15] = (uint8_t)a[3];
127 } 149 }
128 150
129 #endif 151 #endif
130 152
131 #endif//Sk4x4f_DEFINED 153 #endif//Sk4x4f_DEFINED
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698