| Index: src/opts/Sk4x_neon.h
|
| diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
|
| index 3f35fe785b9af2c39ae9ca948e5cacee698ca41f..c86fdea6894dbe285c48bd477f405d14c09feb90 100644
|
| --- a/src/opts/Sk4x_neon.h
|
| +++ b/src/opts/Sk4x_neon.h
|
| @@ -75,10 +75,14 @@ M(Sk4f) subtract(const Sk4f& o) const { return vsubq_f32(fVec, o.fVec); }
|
| M(Sk4f) multiply(const Sk4f& o) const { return vmulq_f32(fVec, o.fVec); }
|
|
|
| M(Sk4f) divide (const Sk4f& o) const {
|
| - float32x4_t est0 = vrecpeq_f32(o.fVec);
|
| - float32x4_t est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0);
|
| - float32x4_t est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
|
| +#if defined(SK_CPU_ARM64)
|
| + return vdivq_f32(fVec, o.fVec);
|
| +#else
|
| + float32x4_t est0 = vrecpeq_f32(o.fVec),
|
| + est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
|
| + est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
|
| return vmulq_f32(est2, fVec);
|
| +#endif
|
| }
|
|
|
| M(Sk4f) rsqrt() const {
|
| @@ -88,10 +92,14 @@ M(Sk4f) rsqrt() const {
|
| }
|
|
|
| M(Sk4f) sqrt() const {
|
| +#if defined(SK_CPU_ARM64)
|
| + return vsqrtq_f32(fVec);
|
| +#else
|
| float32x4_t est1 = this->rsqrt().fVec,
|
| // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
|
| est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
|
| return vmulq_f32(fVec, est2);
|
| +#endif
|
| }
|
|
|
| M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }
|
|
|