| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
| 9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
| 10 | 10 |
| (...skipping 15 matching lines...) |
| 26 | 26 |
| 27 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \ | 27 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \ |
| 28 case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \ | 28 case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \ |
| 29 case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \ | 29 case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \ |
| 30 case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \ | 30 case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \ |
| 31 case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \ | 31 case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \ |
| 32 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \ | 32 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \ |
| 33 case 31: return op(v, 31); } return fVec | 33 case 31: return op(v, 31); } return fVec |
| 34 | 34 |
| 35 template <> | 35 template <> |
| 36 class SkNf<2, float> { | 36 class SkNf<2> { |
| 37 public: | 37 public: |
| 38 SkNf(float32x2_t vec) : fVec(vec) {} | 38 SkNf(float32x2_t vec) : fVec(vec) {} |
| 39 | 39 |
| 40 SkNf() {} | 40 SkNf() {} |
| 41 explicit SkNf(float val) : fVec(vdup_n_f32(val)) {} | 41 explicit SkNf(float val) : fVec(vdup_n_f32(val)) {} |
| 42 static SkNf Load(const float vals[2]) { return vld1_f32(vals); } | 42 static SkNf Load(const float vals[2]) { return vld1_f32(vals); } |
| 43 SkNf(float a, float b) { fVec = (float32x2_t) { a, b }; } | 43 SkNf(float a, float b) { fVec = (float32x2_t) { a, b }; } |
| 44 | 44 |
| 45 void store(float vals[2]) const { vst1_f32(vals, fVec); } | 45 void store(float vals[2]) const { vst1_f32(vals, fVec); } |
| 46 | 46 |
| (...skipping 59 matching lines...) |
| 106 return vget_lane_u32(v,0) && vget_lane_u32(v,1); | 106 return vget_lane_u32(v,0) && vget_lane_u32(v,1); |
| 107 } | 107 } |
| 108 bool anyTrue() const { | 108 bool anyTrue() const { |
| 109 auto v = vreinterpret_u32_f32(fVec); | 109 auto v = vreinterpret_u32_f32(fVec); |
| 110 return vget_lane_u32(v,0) || vget_lane_u32(v,1); | 110 return vget_lane_u32(v,0) || vget_lane_u32(v,1); |
| 111 } | 111 } |
| 112 | 112 |
| 113 float32x2_t fVec; | 113 float32x2_t fVec; |
| 114 }; | 114 }; |
| 115 | 115 |
| 116 #if defined(SK_CPU_ARM64) | |
| 117 template <> | |
| 118 class SkNf<2, double> { | |
| 119 public: | |
| 120 SkNf(float64x2_t vec) : fVec(vec) {} | |
| 121 | |
| 122 SkNf() {} | |
| 123 explicit SkNf(double val) : fVec(vdupq_n_f64(val)) {} | |
| 124 static SkNf Load(const double vals[2]) { return vld1q_f64(vals); } | |
| 125 SkNf(double a, double b) { fVec = (float64x2_t) { a, b }; } | |
| 126 | |
| 127 void store(double vals[2]) const { vst1q_f64(vals, fVec); } | |
| 128 | |
| 129 SkNf operator + (const SkNf& o) const { return vaddq_f64(fVec, o.fVec); } | |
| 130 SkNf operator - (const SkNf& o) const { return vsubq_f64(fVec, o.fVec); } | |
| 131 SkNf operator * (const SkNf& o) const { return vmulq_f64(fVec, o.fVec); } | |
| 132 SkNf operator / (const SkNf& o) const { return vdivq_f64(fVec, o.fVec); } | |
| 133 | |
| 134 // vreinterpretq_f64_u64 and vreinterpretq_f64_u32 don't seem to exist.... weird. | |
| 135 SkNf operator==(const SkNf& o) const { return (float64x2_t)(vceqq_f64(fVec, o.fVec)); } | |
| 136 SkNf operator <(const SkNf& o) const { return (float64x2_t)(vcltq_f64(fVec, o.fVec)); } | |
| 137 SkNf operator >(const SkNf& o) const { return (float64x2_t)(vcgtq_f64(fVec, o.fVec)); } | |
| 138 SkNf operator<=(const SkNf& o) const { return (float64x2_t)(vcleq_f64(fVec, o.fVec)); } | |
| 139 SkNf operator>=(const SkNf& o) const { return (float64x2_t)(vcgeq_f64(fVec, o.fVec)); } | |
| 140 SkNf operator != (const SkNf& o) const { | |
| 141 return (float64x2_t)(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(fVec, o.fVec)))); | |
| 142 } | |
| 143 | |
| 144 static SkNf Min(const SkNf& l, const SkNf& r) { return vminq_f64(l.fVec, r.fVec); } | |
| 145 static SkNf Max(const SkNf& l, const SkNf& r) { return vmaxq_f64(l.fVec, r.fVec); } | |
| 146 | |
| 147 SkNf sqrt() const { return vsqrtq_f64(fVec); } | |
| 148 | |
| 149 SkNf rsqrt0() const { return vrsqrteq_f64(fVec); } | |
| 150 SkNf rsqrt1() const { | |
| 151 float64x2_t est0 = this->rsqrt0().fVec; | |
| 152 return vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est0, est0)), est0); | |
| 153 } | |
| 154 SkNf rsqrt2() const { | |
| 155 float64x2_t est1 = this->rsqrt1().fVec; | |
| 156 return vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est1, est1)), est1); | |
| 157 } | |
| 158 | |
| 159 SkNf approxInvert() const { | |
| 160 float64x2_t est0 = vrecpeq_f64(fVec), | |
| 161 est1 = vmulq_f64(vrecpsq_f64(est0, fVec), est0); | |
| 162 return est1; | |
| 163 } | |
| 164 | |
| 165 SkNf invert() const { | |
| 166 float64x2_t est1 = this->approxInvert().fVec, | |
| 167 est2 = vmulq_f64(vrecpsq_f64(est1, fVec), est1), | |
| 168 est3 = vmulq_f64(vrecpsq_f64(est2, fVec), est2); | |
| 169 return est3; | |
| 170 } | |
| 171 | |
| 172 template <int k> double kth() const { | |
| 173 SkASSERT(0 <= k && k < 2); | |
| 174 return vgetq_lane_f64(fVec, k&1); | |
| 175 } | |
| 176 | |
| 177 // vreinterpretq_u64_f64 doesn't seem to exist.... weird. | |
| 178 bool allTrue() const { | |
| 179 auto v = (uint64x2_t)(fVec); | |
| 180 return vgetq_lane_u64(v,0) && vgetq_lane_u64(v,1); | |
| 181 } | |
| 182 bool anyTrue() const { | |
| 183 auto v = (uint64x2_t)(fVec); | |
| 184 return vgetq_lane_u64(v,0) || vgetq_lane_u64(v,1); | |
| 185 } | |
| 186 | |
| 187 float64x2_t fVec; | |
| 188 }; | |
| 189 #endif//defined(SK_CPU_ARM64) | |
| 190 | |
| 191 template <> | 116 template <> |
| 192 class SkNi<4, int> { | 117 class SkNi<4, int> { |
| 193 public: | 118 public: |
| 194 SkNi(const int32x4_t& vec) : fVec(vec) {} | 119 SkNi(const int32x4_t& vec) : fVec(vec) {} |
| 195 | 120 |
| 196 SkNi() {} | 121 SkNi() {} |
| 197 explicit SkNi(int val) : fVec(vdupq_n_s32(val)) {} | 122 explicit SkNi(int val) : fVec(vdupq_n_s32(val)) {} |
| 198 static SkNi Load(const int vals[4]) { return vld1q_s32(vals); } | 123 static SkNi Load(const int vals[4]) { return vld1q_s32(vals); } |
| 199 SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } | 124 SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } |
| 200 | 125 |
| 201 void store(int vals[4]) const { vst1q_s32(vals, fVec); } | 126 void store(int vals[4]) const { vst1q_s32(vals, fVec); } |
| 202 | 127 |
| 203 SkNi operator + (const SkNi& o) const { return vaddq_s32(fVec, o.fVec); } | 128 SkNi operator + (const SkNi& o) const { return vaddq_s32(fVec, o.fVec); } |
| 204 SkNi operator - (const SkNi& o) const { return vsubq_s32(fVec, o.fVec); } | 129 SkNi operator - (const SkNi& o) const { return vsubq_s32(fVec, o.fVec); } |
| 205 SkNi operator * (const SkNi& o) const { return vmulq_s32(fVec, o.fVec); } | 130 SkNi operator * (const SkNi& o) const { return vmulq_s32(fVec, o.fVec); } |
| 206 | 131 |
| 207 SkNi operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | 132 SkNi operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } |
| 208 SkNi operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | 133 SkNi operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } |
| 209 | 134 |
| 210 template <int k> int kth() const { | 135 template <int k> int kth() const { |
| 211 SkASSERT(0 <= k && k < 4); | 136 SkASSERT(0 <= k && k < 4); |
| 212 return vgetq_lane_s32(fVec, k&3); | 137 return vgetq_lane_s32(fVec, k&3); |
| 213 } | 138 } |
| 214 | 139 |
| 215 int32x4_t fVec; | 140 int32x4_t fVec; |
| 216 }; | 141 }; |
| 217 | 142 |
| 218 template <> | 143 template <> |
| 219 class SkNf<4, float> { | 144 class SkNf<4> { |
| 220 public: | 145 public: |
| 221 SkNf(float32x4_t vec) : fVec(vec) {} | 146 SkNf(float32x4_t vec) : fVec(vec) {} |
| 222 | 147 |
| 223 SkNf() {} | 148 SkNf() {} |
| 224 explicit SkNf(float val) : fVec(vdupq_n_f32(val)) {} | 149 explicit SkNf(float val) : fVec(vdupq_n_f32(val)) {} |
| 225 static SkNf Load(const float vals[4]) { return vld1q_f32(vals); } | 150 static SkNf Load(const float vals[4]) { return vld1q_f32(vals); } |
| 226 static SkNf FromBytes(const uint8_t vals[4]) { | 151 static SkNf FromBytes(const uint8_t vals[4]) { |
| 227 uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); | 152 uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); |
| 228 uint16x8_t fix8_16 = vmovl_u8(fix8); | 153 uint16x8_t fix8_16 = vmovl_u8(fix8); |
| 229 uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); | 154 uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); |
| 230 return SkNf(vcvtq_f32_u32(fix8_32)); | 155 return SkNf(vcvtq_f32_u32(fix8_32)); |
| 231 } | 156 } |
| 232 | 157 |
| 233 SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } | 158 SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } |
| 234 | 159 |
| 235 void store(float vals[4]) const { vst1q_f32(vals, fVec); } | 160 void store(float vals[4]) const { vst1q_f32(vals, fVec); } |
| 236 void toBytes(uint8_t bytes[4]) const { | 161 void toBytes(uint8_t bytes[4]) const { |
| 237 uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); | 162 uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); |
| 238 uint16x4_t fix8_16 = vqmovn_u32(fix8_32); | 163 uint16x4_t fix8_16 = vqmovn_u32(fix8_32); |
| 239 uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); | 164 uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); |
| 240 vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0); | 165 vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0); |
| 241 } | 166 } |
| 242 | 167 |
| 243 SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); } | |
| 244 | |
| 245 SkNf approxInvert() const { | 168 SkNf approxInvert() const { |
| 246 float32x4_t est0 = vrecpeq_f32(fVec), | 169 float32x4_t est0 = vrecpeq_f32(fVec), |
| 247 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); | 170 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); |
| 248 return est1; | 171 return est1; |
| 249 } | 172 } |
| 250 SkNf invert() const { | 173 SkNf invert() const { |
| 251 float32x4_t est1 = this->approxInvert().fVec, | 174 float32x4_t est1 = this->approxInvert().fVec, |
| 252 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); | 175 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); |
| 253 return est2; | 176 return est2; |
| 254 } | 177 } |
| (...skipping 137 matching lines...) |
| 392 uint8x16_t fVec; | 315 uint8x16_t fVec; |
| 393 }; | 316 }; |
| 394 | 317 |
| 395 #undef SHIFT32 | 318 #undef SHIFT32 |
| 396 #undef SHIFT16 | 319 #undef SHIFT16 |
| 397 #undef SHIFT8 | 320 #undef SHIFT8 |
| 398 | 321 |
| 399 } // namespace | 322 } // namespace |
| 400 | 323 |
| 401 #endif//SkNx_neon_DEFINED | 324 #endif//SkNx_neon_DEFINED |
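
A note on the SHIFT16/SHIFT32 macros near the top of the diff: NEON's immediate-shift intrinsics (vshlq_n_s32, vshrq_n_s32) require the shift count to be a compile-time constant, so the macros dispatch a runtime count through a switch onto the immediate variants. A minimal sketch of the same dispatch idea, shortened to three cases; the function name shift_left4 is illustrative, not part of the patch:

    #include <arm_neon.h>

    // vshlq_n_s32(v, n) only accepts a constant n, so a runtime shift count
    // must be dispatched case-by-case onto the immediate forms.
    static int32x4_t shift_left4(int32x4_t v, int bits) {
        switch (bits) {
            case 1:  return vshlq_n_s32(v, 1);
            case 2:  return vshlq_n_s32(v, 2);
            case 3:  return vshlq_n_s32(v, 3);
            default: return v;  // the real SHIFT32 macro covers counts up to 31
        }
    }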
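
Similarly, approxInvert() and invert() in the NEW column follow NEON's standard reciprocal-refinement pattern: vrecpeq_f32 produces a rough estimate of 1/x, and each vrecpsq_f32/vmulq_f32 pair applies one Newton-Raphson step, since vrecpsq_f32(x, est) computes 2 - x*est. A self-contained sketch of the same technique, assuming a two-step refinement as in invert(); the function name reciprocal4 and the test values are illustrative only:

    #include <arm_neon.h>
    #include <stdio.h>

    // Two Newton-Raphson refinements of NEON's reciprocal estimate:
    // est' = est * (2 - x*est), with vrecpsq_f32(x, est) supplying 2 - x*est.
    static float32x4_t reciprocal4(float32x4_t x) {
        float32x4_t est = vrecpeq_f32(x);           // low-precision estimate of 1/x
        est = vmulq_f32(vrecpsq_f32(x, est), est);  // first refinement step
        est = vmulq_f32(vrecpsq_f32(x, est), est);  // second step, near full float precision
        return est;
    }

    int main(void) {
        const float in[4] = { 1.0f, 2.0f, 4.0f, 8.0f };
        float out[4];
        vst1q_f32(out, reciprocal4(vld1q_f32(in)));
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // ~1 0.5 0.25 0.125
        return 0;
    }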