src/opts/Sk4x_neon.h - Issue 1020963002: Specialize Sk2d for ARM64

Side by Side Diff: src/opts/Sk4x_neon.h

Issue 1020963002: Specialize Sk2d for ARM64 (Closed) Base URL: https://skia.googlesource.com/skia@master

Patch Set: Avoid use of vset[q]_lane, instead intializing vectors directly. Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // It is important _not_ to put header guards here.	1 // It is important _not_ to put header guards here.

2 // This file will be intentionally included three times.	2 // This file will be intentionally included three times.

3	3

4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362	4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362

5	5

6 #if defined(SK4X_PREAMBLE)	6 #if defined(SK4X_PREAMBLE)

7 #include <arm_neon.h>	7 #include <arm_neon.h>

8	8

9 // Template metaprogramming to map scalar types to vector types.	9 // Template metaprogramming to map scalar types to vector types.

10 template <typename T> struct SkScalarToSIMD;	10 template <typename T> struct SkScalarToSIMD;

(...skipping 19 matching lines...) Expand all Loading...
30 template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; }	30 template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; }

31 template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {	31 template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {

32 fVec = other.fVec;	32 fVec = other.fVec;

33 return *this;	33 return *this;

34 }	34 }

35	35

36 // Sk4f Methods	36 // Sk4f Methods

37 #define M(...) template <> inline __VA_ARGS__ Sk4f::	37 #define M(...) template <> inline __VA_ARGS__ Sk4f::

38	38

39 M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {}	39 M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {}

40 M() Sk4x(float a, float b, float c, float d) {	40 M() Sk4x(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }

41 // NEON lacks an intrinsic to make this easy. It is recommended to avoid

42 // this constructor unless it is absolutely necessary.

43

44 // I am choosing to use the set lane intrinsics. Particularly, in the case

45 // of floating point, it is likely that the values are already in the right

46 // register file, so this may be the best approach. However, I am not

47 // certain that this is the fastest approach and experimentation might be

48 // useful.

49 fVec = vsetq_lane_f32(a, fVec, 0);

50 fVec = vsetq_lane_f32(b, fVec, 1);

51 fVec = vsetq_lane_f32(c, fVec, 2);

52 fVec = vsetq_lane_f32(d, fVec, 3);

53 }

54	41

55 // As far as I can tell, it's not possible to provide an alignment hint to	42 // As far as I can tell, it's not possible to provide an alignment hint to

56 // NEON using intrinsics. However, I think it is possible at the assembly	43 // NEON using intrinsics. However, I think it is possible at the assembly

57 // level if we want to get into that.	44 // level if we want to get into that.

58 // TODO: Write our own aligned load and store.	45 // TODO: Write our own aligned load and store.

59 M(Sk4f) Load (const float fs[4]) { return vld1q_f32(fs); }	46 M(Sk4f) Load (const float fs[4]) { return vld1q_f32(fs); }

60 M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); }	47 M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); }

61 M(void) store (float fs[4]) const { vst1q_f32(fs, fVec); }	48 M(void) store (float fs[4]) const { vst1q_f32(fs, fVec); }

62 M(void) storeAligned(float fs[4]) const { vst1q_f32 (fs, fVec); }	49 M(void) storeAligned(float fs[4]) const { vst1q_f32 (fs, fVec); }

63	50

(...skipping 59 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
123 float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);	110 float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);

124 float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);	111 float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);

125 return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];	112 return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];

126 }	113 }

127	114

128 // Sk4i Methods	115 // Sk4i Methods

129 #undef M	116 #undef M

130 #define M(...) template <> inline __VA_ARGS__ Sk4i::	117 #define M(...) template <> inline __VA_ARGS__ Sk4i::

131	118

132 M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {}	119 M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {}

133 M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) {	120 M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) { fVec = (int32x4_t) { a, b , c, d }; }

134 // NEON lacks an intrinsic to make this easy. It is recommended to avoid

135 // this constructor unless it is absolutely necessary.

136

137 // There are a few different implementation strategies.

138

139 // uint64_t ab_i = ((uint32_t) a) \| (((uint64_t) b) << 32);

140 // uint64_t cd_i = ((uint32_t) c) \| (((uint64_t) d) << 32);

141 // int32x2_t ab = vcreate_s32(ab_i);

142 // int32x2_t cd = vcreate_s32(cd_i);

143 // fVec = vcombine_s32(ab, cd);

144 // This might not be a bad idea for the integer case. Either way I think,

145 // we will need to move values from general registers to NEON registers.

146

147 // I am choosing to use the set lane intrinsics. I am not certain that

148 // this is the fastest approach. It may be useful to try the above code

149 // for integers.

150 fVec = vsetq_lane_s32(a, fVec, 0);

151 fVec = vsetq_lane_s32(b, fVec, 1);

152 fVec = vsetq_lane_s32(c, fVec, 2);

153 fVec = vsetq_lane_s32(d, fVec, 3);

154 }

155	121

156 // As far as I can tell, it's not possible to provide an alignment hint to	122 // As far as I can tell, it's not possible to provide an alignment hint to

157 // NEON using intrinsics. However, I think it is possible at the assembly	123 // NEON using intrinsics. However, I think it is possible at the assembly

158 // level if we want to get into that.	124 // level if we want to get into that.

159 M(Sk4i) Load (const int32_t is[4]) { return vld1q_s32(is); }	125 M(Sk4i) Load (const int32_t is[4]) { return vld1q_s32(is); }

160 M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); }	126 M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); }

161 M(void) store (int32_t is[4]) const { vst1q_s32(is, fVec); }	127 M(void) store (int32_t is[4]) const { vst1q_s32(is, fVec); }

162 M(void) storeAligned(int32_t is[4]) const { vst1q_s32 (is, fVec); }	128 M(void) storeAligned(int32_t is[4]) const { vst1q_s32 (is, fVec); }

163	129

164 template <>	130 template <>

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
217 }	183 }

218 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {	184 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {

219 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);	185 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);

220 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);	186 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);

221 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];	187 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];

222 }	188 }

223	189

224 #undef M	190 #undef M

225	191

226 #endif	192 #endif

OLD	NEW

« no previous file with comments | « src/opts/Sk2x_neon.h ('k') | no next file » | no next file with comments »