| Index: src/opts/SkNx_neon.h
|
| diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
|
| index a4b7cd1a731045380619052d9e5b945f3779fdd1..2cb8eb348d448e1aa9af4685c9739fb9123f46e7 100644
|
| --- a/src/opts/SkNx_neon.h
|
| +++ b/src/opts/SkNx_neon.h
|
| @@ -10,8 +10,6 @@
|
|
|
| #define SKNX_IS_FAST
|
|
|
| -namespace { // See SkNx.h
|
| -
|
| // Well, this is absurd. The shifts require compile-time constant arguments.
|
|
|
| #define SHIFT8(op, v, bits) switch(bits) { \
|
| @@ -98,10 +96,12 @@ public:
|
| #endif
|
| }
|
|
|
| - template <int k> float kth() const {
|
| + float operator[](int k) const {
|
| SkASSERT(0 <= k && k < 2);
|
| - return vget_lane_f32(fVec, k&1);
|
| + union { float32x2_t v; float fs[2]; } pun = {fVec};
|
| + return pun.fs[k&1];
|
| }
|
| + template <int k> float kth() const { return (*this)[k]; }
|
|
|
| bool allTrue() const {
|
| auto v = vreinterpret_u32_f32(fVec);
|
| @@ -116,33 +116,6 @@ public:
|
| };
|
|
|
| template <>
|
| -class SkNx<4, int> {
|
| -public:
|
| - SkNx(const int32x4_t& vec) : fVec(vec) {}
|
| -
|
| - SkNx() {}
|
| - SkNx(int val) : fVec(vdupq_n_s32(val)) {}
|
| - static SkNx Load(const void* ptr) { return vld1q_s32((const int*)ptr); }
|
| - SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }
|
| -
|
| - void store(void* ptr) const { vst1q_s32((int*)ptr, fVec); }
|
| -
|
| - SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
|
| - SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
|
| - SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
|
| -
|
| - SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
|
| - SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
|
| -
|
| - template <int k> int kth() const {
|
| - SkASSERT(0 <= k && k < 4);
|
| - return vgetq_lane_s32(fVec, k&3);
|
| - }
|
| -
|
| - int32x4_t fVec;
|
| -};
|
| -
|
| -template <>
|
| class SkNx<4, float> {
|
| public:
|
| SkNx(float32x4_t vec) : fVec(vec) {}
|
| @@ -207,10 +180,12 @@ public:
|
| #endif
|
| }
|
|
|
| - template <int k> float kth() const {
|
| + float operator[](int k) const {
|
| SkASSERT(0 <= k && k < 4);
|
| - return vgetq_lane_f32(fVec, k&3);
|
| + union { float32x4_t v; float fs[4]; } pun = {fVec};
|
| + return pun.fs[k&3];
|
| }
|
| + template <int k> float kth() const { return (*this)[k]; }
|
|
|
| bool allTrue() const {
|
| auto v = vreinterpretq_u32_f32(fVec);
|
| @@ -257,10 +232,12 @@ public:
|
|
|
| static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }
|
|
|
| - template <int k> uint16_t kth() const {
|
| + uint16_t operator[](int k) const {
|
| SkASSERT(0 <= k && k < 4);
|
| - return vget_lane_u16(fVec, k&3);
|
| + union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
|
| + return pun.us[k&3];
|
| }
|
| + template <int k> uint16_t kth() const { return (*this)[k]; }
|
|
|
| SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
| return vbsl_u16(fVec, t.fVec, e.fVec);
|
| @@ -294,10 +271,12 @@ public:
|
|
|
| static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }
|
|
|
| - template <int k> uint16_t kth() const {
|
| + uint16_t operator[](int k) const {
|
| SkASSERT(0 <= k && k < 8);
|
| - return vgetq_lane_u16(fVec, k&7);
|
| + union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
|
| + return pun.us[k&7];
|
| }
|
| + template <int k> uint16_t kth() const { return (*this)[k]; }
|
|
|
| SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
| return vbslq_u16(fVec, t.fVec, e.fVec);
|
| @@ -350,10 +329,12 @@ public:
|
| static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
|
| SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }
|
|
|
| - template <int k> uint8_t kth() const {
|
| - SkASSERT(0 <= k && k < 15);
|
| - return vgetq_lane_u8(fVec, k&16);
|
| + uint8_t operator[](int k) const {
|
| + SkASSERT(0 <= k && k < 16);
|
| + union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
|
| + return pun.us[k&15];
|
| }
|
| + template <int k> uint8_t kth() const { return (*this)[k]; }
|
|
|
| SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
| return vbslq_u8(fVec, t.fVec, e.fVec);
|
| @@ -366,17 +347,13 @@ public:
|
| #undef SHIFT16
|
| #undef SHIFT8
|
|
|
| -template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) {
|
| - return vcvtq_s32_f32(src.fVec);
|
| -}
|
| -
|
| -template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) {
|
| +template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
|
| uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
|
| uint16x4_t _16 = vqmovn_u32(_32);
|
| return vqmovn_u16(vcombine_u16(_16, _16));
|
| }
|
|
|
| -template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) {
|
| +template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
|
| uint16x8_t _16 = vmovl_u8 (src.fVec) ;
|
| uint32x4_t _32 = vmovl_u16(vget_low_u16(_16));
|
| return vcvtq_f32_u32(_32);
|
| @@ -390,14 +367,12 @@ static inline void Sk4f_ToBytes(uint8_t bytes[16],
|
| (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]);
|
| }
|
|
|
| -template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) {
|
| +template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
|
| return vget_low_u16(vmovl_u8(src.fVec));
|
| }
|
|
|
| -template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) {
|
| +template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
|
| return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
|
| }
|
|
|
| -} // namespace
|
| -
|
| #endif//SkNx_neon_DEFINED
|
|
|