Index: src/opts/SkNx_neon.h |
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h |
index 2cb8eb348d448e1aa9af4685c9739fb9123f46e7..cdc4615849556ff185afd59f15c82e8d5a5320b5 100644 |
--- a/src/opts/SkNx_neon.h |
+++ b/src/opts/SkNx_neon.h |
@@ -10,6 +10,17 @@ |
#define SKNX_IS_FAST |
+// ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it: |
+//   - round to an integer by adding 2^23 carrying v's sign, then subtracting it again; |
+//   - if that rounded value is bigger than our input, subtract 1; |
+//   - lanes with |v| >= 2^23 are already integral, and the add above would itself round |
+//     some of them (e.g. 2^23+1 would come back as 2^23), so they pass through unchanged. |
+// NaN lanes fail both comparisons below and are returned as-is. |
+static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { |
+    auto two23   = vdupq_n_f32(1<<23); |
+    // Isolate each lane's sign bit so the bias has the same sign as the input. |
+    auto sign    = vandq_u32((uint32x4_t)v, vdupq_n_u32(1u<<31)); |
+    auto bias    = (float32x4_t)vorrq_u32((uint32x4_t)two23, sign); |
+    auto rounded = vsubq_f32(vaddq_f32(v, bias), bias); |
+    // too_big is an all-ones lane mask; and-ing it with the bits of 1.0f yields 1.0f or 0.0f. |
+    auto too_big = vcgtq_f32(rounded, v); |
+    auto floored = vsubq_f32(rounded, |
+                             (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1))); |
+    // Keep the emulated result only where the bias trick is exact, i.e. |v| < 2^23. |
+    auto in_range = vcltq_f32(vabsq_f32(v), two23); |
+    return vbslq_f32(in_range, floored, v); |
+} |
+ |
// Well, this is absurd. The shifts require compile-time constant arguments. |
#define SHIFT8(op, v, bits) switch(bits) { \ |
@@ -161,6 +172,14 @@ public: |
static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); } |
SkNx abs() const { return vabsq_f32(fVec); } |
+ SkNx floor() const { |
+     // ARM64 floors with vrndmq_f32 directly; every other build goes through the |
+     // armv7_vrndmq_f32 emulation. |
+ #if !defined(SK_CPU_ARM64) |
+     return armv7_vrndmq_f32(fVec); |
+ #else |
+     return vrndmq_f32(fVec); |
+ #endif |
+ } |
+ |
SkNx rsqrt0() const { return vrsqrteq_f32(fVec); } |
SkNx rsqrt1() const { |