src/opts/Sk4x_neon.h - Issue 1027753003: Add divide to Sk2x, use native vdiv and vsqrt on ARM 64.

Side by Side Diff: src/opts/Sk4x_neon.h

Issue 1027753003: Add divide to Sk2x, use native vdiv and vsqrt on ARM 64. (Closed) Base URL: https://skia.googlesource.com/skia@master

Patch Set: Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // It is important _not_ to put header guards here.	1 // It is important _not_ to put header guards here.

2 // This file will be intentionally included three times.	2 // This file will be intentionally included three times.

3	3

4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362	4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362

5	5

6 #if defined(SK4X_PREAMBLE)	6 #if defined(SK4X_PREAMBLE)

7 #include <arm_neon.h>	7 #include <arm_neon.h>

8	8

9 // Template metaprogramming to map scalar types to vector types.	9 // Template metaprogramming to map scalar types to vector types.

10 template <typename T> struct SkScalarToSIMD;	10 template <typename T> struct SkScalarToSIMD;

(...skipping 57 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
68 M(Sk4i) cast<Sk4i>() const { return vcvtq_s32_f32(fVec); }	68 M(Sk4i) cast<Sk4i>() const { return vcvtq_s32_f32(fVec); }

69	69

70 // We're going to skip allTrue(), anyTrue(), and bit-manipulators	70 // We're going to skip allTrue(), anyTrue(), and bit-manipulators

71 // for Sk4f. Code that calls them probably does so accidentally.	71 // for Sk4f. Code that calls them probably does so accidentally.

72 // Ask msarett or mtklein to fill these in if you really need them.	72 // Ask msarett or mtklein to fill these in if you really need them.

73 M(Sk4f) add (const Sk4f& o) const { return vaddq_f32(fVec, o.fVec); }	73 M(Sk4f) add (const Sk4f& o) const { return vaddq_f32(fVec, o.fVec); }

74 M(Sk4f) subtract(const Sk4f& o) const { return vsubq_f32(fVec, o.fVec); }	74 M(Sk4f) subtract(const Sk4f& o) const { return vsubq_f32(fVec, o.fVec); }

75 M(Sk4f) multiply(const Sk4f& o) const { return vmulq_f32(fVec, o.fVec); }	75 M(Sk4f) multiply(const Sk4f& o) const { return vmulq_f32(fVec, o.fVec); }

76	76

77 M(Sk4f) divide (const Sk4f& o) const {	77 M(Sk4f) divide (const Sk4f& o) const {

78 float32x4_t est0 = vrecpeq_f32(o.fVec);	78 #if defined(SK_CPU_ARM64)

79 float32x4_t est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0);	79 return vdivq_f32(fVec, o.fVec);

80 float32x4_t est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);	80 #else

	81 float32x4_t est0 = vrecpeq_f32(o.fVec),

	82 est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),

	83 est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);

81 return vmulq_f32(est2, fVec);	84 return vmulq_f32(est2, fVec);

	85 #endif

82 }	86 }

83	87

84 M(Sk4f) rsqrt() const {	88 M(Sk4f) rsqrt() const {

85 float32x4_t est0 = vrsqrteq_f32(fVec),	89 float32x4_t est0 = vrsqrteq_f32(fVec),

86 est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0 );	90 est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0 );

87 return est1;	91 return est1;

88 }	92 }

89	93

90 M(Sk4f) sqrt() const {	94 M(Sk4f) sqrt() const {

	95 #if defined(SK_CPU_ARM64)

	96 return vsqrtq_f32(fVec);

	97 #else

91 float32x4_t est1 = this->rsqrt().fVec,	98 float32x4_t est1 = this->rsqrt().fVec,

92 // An extra step of Newton's method to refine the estimate of 1/sqrt(this).	99 // An extra step of Newton's method to refine the estimate of 1/sqrt(this).

93 est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1 );	100 est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1 );

94 return vmulq_f32(fVec, est2);	101 return vmulq_f32(fVec, est2);

	102 #endif

95 }	103 }

96	104

97 M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }	105 M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }

98 M(Sk4i) notEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); }	106 M(Sk4i) notEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); }

99 M(Sk4i) lessThan (const Sk4f& o) const { return vreinterpretq_s32_u32(vcltq_f32(fVec, o.fVec)); }	107 M(Sk4i) lessThan (const Sk4f& o) const { return vreinterpretq_s32_u32(vcltq_f32(fVec, o.fVec)); }

100 M(Sk4i) greaterThan (const Sk4f& o) const { return vreinterpretq_s32_u32(vcgtq_f32(fVec, o.fVec)); }	108 M(Sk4i) greaterThan (const Sk4f& o) const { return vreinterpretq_s32_u32(vcgtq_f32(fVec, o.fVec)); }

101 M(Sk4i) lessThanEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vcleq_f32(fVec, o.fVec)); }	109 M(Sk4i) lessThanEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vcleq_f32(fVec, o.fVec)); }

102 M(Sk4i) greaterThanEqual(const Sk4f& o) const { return vreinterpretq_s32_u32(vcgeq_f32(fVec, o.fVec)); }	110 M(Sk4i) greaterThanEqual(const Sk4f& o) const { return vreinterpretq_s32_u32(vcgeq_f32(fVec, o.fVec)); }

103	111

104 M(Sk4f) Min(const Sk4f& a, const Sk4f& b) { return vminq_f32(a.fVec, b.fVec); }	112 M(Sk4f) Min(const Sk4f& a, const Sk4f& b) { return vminq_f32(a.fVec, b.fVec); }

(...skipping 112 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
217 }	225 }

218 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {	226 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {

219 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);	227 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);

220 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);	228 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);

221 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];	229 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];

222 }	230 }

223	231

224 #undef M	232 #undef M

225	233

226 #endif	234 #endif

OLD	NEW

« no previous file with comments | « src/opts/Sk2x_sse.h ('k') | tests/Sk2xTest.cpp » ('j') | no next file with comments »