Index: src/opts/Sk4x_neon.h |
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h |
index 2851fb31a4e7525f6885d89a2410fde3fa05018d..3f35fe785b9af2c39ae9ca948e5cacee698ca41f 100644 |
--- a/src/opts/Sk4x_neon.h |
+++ b/src/opts/Sk4x_neon.h |
@@ -82,13 +82,17 @@ M(Sk4f) divide (const Sk4f& o) const { |
} |
M(Sk4f) rsqrt() const { |
- float32x4_t est0 = vrsqrteq_f32(fVec); |
- float32x4_t est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0); |
- float32x4_t est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1); |
- return est2; |
+ float32x4_t est0 = vrsqrteq_f32(fVec), |
+ est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0); |
+ return est1; |
} |
-M(Sk4f) sqrt() const { return this->multiply(this->rsqrt()); } |
+M(Sk4f) sqrt() const { |
+ float32x4_t est1 = this->rsqrt().fVec, |
+ // An extra step of Newton's method to refine the estimate of 1/sqrt(this). |
+ est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1); |
+ return vmulq_f32(fVec, est2); |
+} |
M(Sk4i) equal (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); } |
M(Sk4i) notEqual (const Sk4f& o) const { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); } |