| Index: src/opts/Sk4x_neon.h
|
| diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
|
| index 2851fb31a4e7525f6885d89a2410fde3fa05018d..3f35fe785b9af2c39ae9ca948e5cacee698ca41f 100644
|
| --- a/src/opts/Sk4x_neon.h
|
| +++ b/src/opts/Sk4x_neon.h
|
| @@ -82,13 +82,17 @@ M(Sk4f) divide (const Sk4f& o) const {
|
| }
|
|
|
| M(Sk4f) rsqrt() const {
|
| - float32x4_t est0 = vrsqrteq_f32(fVec);
|
| - float32x4_t est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
|
| - float32x4_t est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
|
| - return est2;
|
| + float32x4_t est0 = vrsqrteq_f32(fVec),
|
| + est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
|
| + return est1;
|
| }
|
|
|
| -M(Sk4f) sqrt() const { return this->multiply(this->rsqrt()); }
|
| +M(Sk4f) sqrt() const {
|
| + float32x4_t est1 = this->rsqrt().fVec,
|
| + // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
|
| + est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
|
| + return vmulq_f32(fVec, est2);
|
| +}
|
|
|
| M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }
|
| M(Sk4i) notEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); }
|
|
|