| Index: src/opts/SkNx_neon.h | 
| diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h | 
| index cdc4615849556ff185afd59f15c82e8d5a5320b5..641e9d296b54014e124ab306a24df328bc1544f9 100644 | 
| --- a/src/opts/SkNx_neon.h | 
| +++ b/src/opts/SkNx_neon.h | 
| @@ -11,14 +11,13 @@ | 
| #define SKNX_IS_FAST | 
|  | 
| // ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it: | 
| -//   - round by adding (1<<23) with our sign, then subtracting it; | 
| -//   - if that rounded value is bigger than our input, subtract 1. | 
| +//   - roundtrip through integers via truncation; | 
| +//   - subtract 1 if that's too big (possible for negative values). | 
| +// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big. | 
| static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { | 
| -    auto sign = vandq_u32((uint32x4_t)v, vdupq_n_u32(1<<31)); | 
| -    auto bias = (float32x4_t)(vorrq_u32((uint32x4_t)vdupq_n_f32(1<<23), sign)); | 
| -    auto rounded = vsubq_f32(vaddq_f32(v, bias), bias); | 
| -    auto too_big = vcgtq_f32(rounded, v); | 
| -    return vsubq_f32(rounded, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1))); | 
| +    auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));  /* float -> s32 -> float (truncates) */ | 
| +    auto too_big = vcgtq_f32(roundtrip, v);  /* lane mask: truncation landed above v */ | 
| +    return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1))); | 
| } | 
|  | 
| // Well, this is absurd.  The shifts require compile-time constant arguments. | 
|  |