// It is important _not_ to put header guards here.
// This file will be intentionally included three times.
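// The including header drives the three passes: once with SK4X_PREAMBLE
// defined (includes and type mappings), once with SK4X_PRIVATE defined
// (members injected into the Sk4x class body), and once with neither,
// for the out-of-line method definitions below.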

#if defined(SK4X_PREAMBLE)
    #include <arm_neon.h>

    // Template metaprogramming to map scalar types to vector types.
    template <typename T> struct SkScalarToSIMD;
    template <> struct SkScalarToSIMD<float>   { typedef float32x4_t Type; };
    template <> struct SkScalarToSIMD<int32_t> { typedef int32x4_t   Type; };

#elif defined(SK4X_PRIVATE)
    Sk4x(float32x4_t);
    Sk4x(int32x4_t);

    typename SkScalarToSIMD<T>::Type fVec;

#else

// Vector Constructors
//template <> inline Sk4f::Sk4x(int32x4_t v) : fVec(vcvtq_f32_s32(v)) {}
template <> inline Sk4f::Sk4x(float32x4_t v) : fVec(v) {}
template <> inline Sk4i::Sk4x(int32x4_t v) : fVec(v) {}
//template <> inline Sk4i::Sk4x(float32x4_t v) : fVec(vcvtq_s32_f32(v)) {}

// Generic Methods
template <typename T> Sk4x<T>::Sk4x() {}
template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; }
template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {
    fVec = other.fVec;
    return *this;
}

// Sk4f Methods
#define M(...) template <> inline __VA_ARGS__ Sk4f::
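// M(...) abbreviates the boilerplate each specialized Sk4f member definition
// needs: "template <> inline <return type> Sk4f::".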

M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {}
M() Sk4x(float a, float b, float c, float d) {
    // NEON lacks an intrinsic to make this easy. It is recommended to avoid
    // this constructor unless it is absolutely necessary.

    // I am choosing to use the set-lane intrinsics. In the floating-point
    // case in particular, the values are likely already in the right
    // register file, so this may be the best approach. However, I am not
    // certain that it is the fastest approach, and experimentation might be
    // useful.
    fVec = vsetq_lane_f32(a, fVec, 0);
    fVec = vsetq_lane_f32(b, fVec, 1);
    fVec = vsetq_lane_f32(c, fVec, 2);
    fVec = vsetq_lane_f32(d, fVec, 3);
}

// As far as I can tell, it's not possible to provide an alignment hint to
// NEON using intrinsics. However, I think it is possible at the assembly
// level if we want to get into that.
// TODO: Write our own aligned load and store.
M(Sk4f) Load       (const float fs[4]) { return vld1q_f32(fs); }
M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); }
M(void) store       (float fs[4]) const { vst1q_f32(fs, fVec); }
M(void) storeAligned(float fs[4]) const { vst1q_f32(fs, fVec); }

template <>
M(Sk4i) reinterpret<Sk4i>() const { return vreinterpretq_s32_f32(fVec); }

template <>
M(Sk4i) cast<Sk4i>() const { return vcvtq_s32_f32(fVec); }

// We're going to skip allTrue(), anyTrue(), and bit-manipulators
// for Sk4f. Code that calls them probably does so accidentally.
// Ask msarett or mtklein to fill these in if you really need them.
M(Sk4f) add     (const Sk4f& o) const { return vaddq_f32(fVec, o.fVec); }
M(Sk4f) subtract(const Sk4f& o) const { return vsubq_f32(fVec, o.fVec); }
M(Sk4f) multiply(const Sk4f& o) const { return vmulq_f32(fVec, o.fVec); }

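// Approximate division: vrecpeq_f32 gives a coarse reciprocal estimate of o,
// and each vrecpsq_f32/vmulq_f32 pair below is one Newton-Raphson refinement
// step (vrecpsq_f32(e, x) computes 2 - e*x). Two steps bring the estimate
// close to full single precision, but this is not an exact IEEE divide.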
M(Sk4f) divide (const Sk4f& o) const {
    float32x4_t est0 = vrecpeq_f32(o.fVec);
    float32x4_t est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0);
    float32x4_t est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
    return vmulq_f32(est2, fVec);
}

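// Reciprocal square root by the same scheme: vrsqrteq_f32 gives the initial
// estimate, and each vrsqrtsq_f32/vmulq_f32 pair is one Newton-Raphson step
// (vrsqrtsq_f32(x, e*e) computes (3 - x*e*e) / 2).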
M(Sk4f) rsqrt() const {
    float32x4_t est0 = vrsqrteq_f32(fVec);
    float32x4_t est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
    float32x4_t est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
    return est2;
}

M(Sk4f) sqrt() const { return this->multiply(this->rsqrt()); }

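// NEON comparisons return a uint32x4_t mask with each lane either all ones or
// all zeros; we reinterpret that mask as int32x4_t so the result can be
// carried in an Sk4i (see allTrue()/anyTrue() below).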
M(Sk4i) equal           (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }
M(Sk4i) notEqual        (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); }
M(Sk4i) lessThan        (const Sk4f& o) const { return vreinterpretq_s32_u32(vcltq_f32(fVec, o.fVec)); }
M(Sk4i) greaterThan     (const Sk4f& o) const { return vreinterpretq_s32_u32(vcgtq_f32(fVec, o.fVec)); }
M(Sk4i) lessThanEqual   (const Sk4f& o) const { return vreinterpretq_s32_u32(vcleq_f32(fVec, o.fVec)); }
M(Sk4i) greaterThanEqual(const Sk4f& o) const { return vreinterpretq_s32_u32(vcgeq_f32(fVec, o.fVec)); }

M(Sk4f) Min(const Sk4f& a, const Sk4f& b) { return vminq_f32(a.fVec, b.fVec); }
M(Sk4f) Max(const Sk4f& a, const Sk4f& b) { return vmaxq_f32(a.fVec, b.fVec); }

// These shuffle operations are implemented more efficiently with SSE.
// NEON has efficient zip, unzip, and transpose, but it is more costly to
// exploit zip and unzip in order to shuffle.
M(Sk4f) zwxy() const {
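    // Zip with zero gives {x,0,y,0} and {z,0,w,0}; unzipping those two halves
    // in swapped order pulls the even lanes back out as {z,w,x,y}.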
    float32x4x2_t zip = vzipq_f32(fVec, vdupq_n_f32(0.0));
    return vuzpq_f32(zip.val[1], zip.val[0]).val[0];
}
// Note that XYAB and ZWCD share code. If both are needed, they could be
// implemented more efficiently together. Also, ABXY and CDZW are available
// as well.
M(Sk4f) XYAB(const Sk4f& xyzw, const Sk4f& abcd) {
    float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
    float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
    return vuzpq_f32(xayb_zcwd.val[0], axby_czdw.val[0]).val[0];
}
M(Sk4f) ZWCD(const Sk4f& xyzw, const Sk4f& abcd) {
    float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
    float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
    return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
}

// Sk4i Methods
#undef M
#define M(...) template <> inline __VA_ARGS__ Sk4i::

M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {}
M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) {
    // NEON lacks an intrinsic to make this easy. It is recommended to avoid
    // this constructor unless it is absolutely necessary.

    // There are a few different implementation strategies.

    // uint64_t ab_i = ((uint32_t) a) | (((uint64_t) b) << 32);
    // uint64_t cd_i = ((uint32_t) c) | (((uint64_t) d) << 32);
    // int32x2_t ab = vcreate_s32(ab_i);
    // int32x2_t cd = vcreate_s32(cd_i);
    // fVec = vcombine_s32(ab, cd);
    // This might not be a bad idea for the integer case. Either way, I think
    // we will need to move values from general registers to NEON registers.

    // I am choosing to use the set-lane intrinsics. I am not certain that
    // this is the fastest approach. It may be useful to try the above code
    // for integers.
    fVec = vsetq_lane_s32(a, fVec, 0);
    fVec = vsetq_lane_s32(b, fVec, 1);
    fVec = vsetq_lane_s32(c, fVec, 2);
    fVec = vsetq_lane_s32(d, fVec, 3);
}

// As far as I can tell, it's not possible to provide an alignment hint to
// NEON using intrinsics. However, I think it is possible at the assembly
// level if we want to get into that.
M(Sk4i) Load       (const int32_t is[4]) { return vld1q_s32(is); }
M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); }
M(void) store       (int32_t is[4]) const { vst1q_s32(is, fVec); }
M(void) storeAligned(int32_t is[4]) const { vst1q_s32(is, fVec); }

template <>
M(Sk4f) reinterpret<Sk4f>() const { return vreinterpretq_f32_s32(fVec); }

template <>
M(Sk4f) cast<Sk4f>() const { return vcvtq_f32_s32(fVec); }

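// allTrue()/anyTrue() assume each lane is a comparison mask, i.e. all ones
// (-1) or all zeros (0), so bitwise AND/OR across the lanes gives the right
// boolean answer. They are not meant for arbitrary integer values.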
M(bool) allTrue() const {
    int32_t a = vgetq_lane_s32(fVec, 0);
    int32_t b = vgetq_lane_s32(fVec, 1);
    int32_t c = vgetq_lane_s32(fVec, 2);
    int32_t d = vgetq_lane_s32(fVec, 3);
    return a & b & c & d;
}
M(bool) anyTrue() const {
    int32_t a = vgetq_lane_s32(fVec, 0);
    int32_t b = vgetq_lane_s32(fVec, 1);
    int32_t c = vgetq_lane_s32(fVec, 2);
    int32_t d = vgetq_lane_s32(fVec, 3);
    return a | b | c | d;
}

M(Sk4i) bitNot() const { return vmvnq_s32(fVec); }
M(Sk4i) bitAnd(const Sk4i& o) const { return vandq_s32(fVec, o.fVec); }
M(Sk4i) bitOr (const Sk4i& o) const { return vorrq_s32(fVec, o.fVec); }

M(Sk4i) equal           (const Sk4i& o) const { return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec)); }
M(Sk4i) notEqual        (const Sk4i& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(fVec, o.fVec))); }
M(Sk4i) lessThan        (const Sk4i& o) const { return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec)); }
M(Sk4i) greaterThan     (const Sk4i& o) const { return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec)); }
M(Sk4i) lessThanEqual   (const Sk4i& o) const { return vreinterpretq_s32_u32(vcleq_s32(fVec, o.fVec)); }
M(Sk4i) greaterThanEqual(const Sk4i& o) const { return vreinterpretq_s32_u32(vcgeq_s32(fVec, o.fVec)); }

M(Sk4i) add     (const Sk4i& o) const { return vaddq_s32(fVec, o.fVec); }
M(Sk4i) subtract(const Sk4i& o) const { return vsubq_s32(fVec, o.fVec); }
M(Sk4i) multiply(const Sk4i& o) const { return vmulq_s32(fVec, o.fVec); }
// NEON does not have integer reciprocal, sqrt, or division.
M(Sk4i) Min(const Sk4i& a, const Sk4i& b) { return vminq_s32(a.fVec, b.fVec); }
M(Sk4i) Max(const Sk4i& a, const Sk4i& b) { return vmaxq_s32(a.fVec, b.fVec); }

// These shuffle operations are implemented more efficiently with SSE.
// NEON has efficient zip, unzip, and transpose, but it is more costly to
// exploit zip and unzip in order to shuffle.
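// Same zip-with-zero / unzip trick as the Sk4f shuffles above.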
M(Sk4i) zwxy() const {
    int32x4x2_t zip = vzipq_s32(fVec, vdupq_n_s32(0));
    return vuzpq_s32(zip.val[1], zip.val[0]).val[0];
}
// Note that XYAB and ZWCD share code. If both are needed, they could be
// implemented more efficiently together. Also, ABXY and CDZW are available
// as well.
M(Sk4i) XYAB(const Sk4i& xyzw, const Sk4i& abcd) {
    int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
    int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
    return vuzpq_s32(xayb_zcwd.val[0], axby_czdw.val[0]).val[0];
}
M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {
    int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
    int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
    return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
}

#undef M

#endif