| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
| 9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
| 10 | 10 |
| 11 #define SKNX_IS_FAST | 11 #define SKNX_IS_FAST |
| 12 | 12 |
| 13 namespace { // See SkNx.h | |
| 14 | |
| 15 // Well, this is absurd. The shifts require compile-time constant arguments. | 13 // Well, this is absurd. The shifts require compile-time constant arguments. |
| 16 | 14 |
| 17 #define SHIFT8(op, v, bits) switch(bits) { \ | 15 #define SHIFT8(op, v, bits) switch(bits) { \ |
| 18 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \ | 16 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \ |
| 19 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \ | 17 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \ |
| 20 case 7: return op(v, 7); \ | 18 case 7: return op(v, 7); \ |
| 21 } return fVec | 19 } return fVec |
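NEON shift intrinsics such as vshl_n_u16 encode the shift amount as an instruction immediate, so a runtime `bits` value has to be dispatched through a switch whose cases are all literals. A minimal standalone sketch of the same trick, using the hypothetical helper name shl_u16:

    #include <arm_neon.h>

    // Each case hands vshl_n_u16 a literal, satisfying the immediate-operand
    // requirement; the default mirrors the macro's trailing "return fVec".
    static inline uint16x4_t shl_u16(uint16x4_t v, int bits) {
        switch (bits) {
            case 1: return vshl_n_u16(v, 1);
            case 2: return vshl_n_u16(v, 2);
            case 3: return vshl_n_u16(v, 3);
            // SHIFT16 enumerates every legal immediate up to 15 this way.
            default: return v;   // bits == 0: shifting by zero changes nothing
        }
    }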
| 22 | 20 |
| 23 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \ | 21 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \ |
| 24 case 8: return op(v, 8); case 9: return op(v, 9); \ | 22 case 8: return op(v, 8); case 9: return op(v, 9); \ |
| (...skipping 66 matching lines...) | |
| 91 } | 89 } |
| 92 | 90 |
| 93 SkNx sqrt() const { | 91 SkNx sqrt() const { |
| 94 #if defined(SK_CPU_ARM64) | 92 #if defined(SK_CPU_ARM64) |
| 95 return vsqrt_f32(fVec); | 93 return vsqrt_f32(fVec); |
| 96 #else | 94 #else |
| 97 return *this * this->rsqrt2(); | 95 return *this * this->rsqrt2(); |
| 98 #endif | 96 #endif |
| 99 } | 97 } |
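32-bit ARM has no lane-wise float sqrt instruction, so the non-ARM64 path computes sqrt(x) as x * (1/sqrt(x)). rsqrt2() lives in the elided lines; assuming it refines the vrsqrte_f32 estimate with two Newton-Raphson steps (the usual pattern, and what the name suggests), the whole path is roughly:

    #include <arm_neon.h>

    // Sketch only: vrsqrts_f32(a, b) computes (3 - a*b) / 2, the Newton-Raphson
    // correction factor for 1/sqrt, so each line sharpens the estimate.
    static inline float32x2_t sqrt2_sketch(float32x2_t x) {
        float32x2_t e = vrsqrte_f32(x);                   // ~12-bit 1/sqrt(x) estimate
        e = vmul_f32(e, vrsqrts_f32(x, vmul_f32(e, e)));  // first refinement
        e = vmul_f32(e, vrsqrts_f32(x, vmul_f32(e, e)));  // second refinement
        return vmul_f32(x, e);                            // x * 1/sqrt(x) == sqrt(x)
    }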
| 100 | 98 |
| 101 template <int k> float kth() const { | 99 float operator[](int k) const { |
| 102 SkASSERT(0 <= k && k < 2); | 100 SkASSERT(0 <= k && k < 2); |
| 103 return vget_lane_f32(fVec, k&1); | 101 union { float32x2_t v; float fs[2]; } pun = {fVec}; |
| 102 return pun.fs[k&1]; |
| 104 } | 103 } |
| 104 template <int k> float kth() const { return (*this)[k]; } |
| 105 | 105 |
| 106 bool allTrue() const { | 106 bool allTrue() const { |
| 107 auto v = vreinterpret_u32_f32(fVec); | 107 auto v = vreinterpret_u32_f32(fVec); |
| 108 return vget_lane_u32(v,0) && vget_lane_u32(v,1); | 108 return vget_lane_u32(v,0) && vget_lane_u32(v,1); |
| 109 } | 109 } |
| 110 bool anyTrue() const { | 110 bool anyTrue() const { |
| 111 auto v = vreinterpret_u32_f32(fVec); | 111 auto v = vreinterpret_u32_f32(fVec); |
| 112 return vget_lane_u32(v,0) || vget_lane_u32(v,1); | 112 return vget_lane_u32(v,0) || vget_lane_u32(v,1); |
| 113 } | 113 } |
| 114 | 114 |
| 115 float32x2_t fVec; | 115 float32x2_t fVec; |
| 116 }; | 116 }; |
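The switch from kth<k>() to operator[] matters because vget_lane_f32 also insists on a compile-time-constant lane index; the union pun instead reads any lane at runtime by letting the vector spill to memory. The same accessor as a free function, under a hypothetical name lane2:

    #include <arm_neon.h>
    #include <cassert>

    // Type-punning through a union sidesteps vget_lane_f32's constant-lane
    // requirement; k & 1 keeps the index in range, matching the SkASSERT.
    static inline float lane2(float32x2_t v, int k) {
        assert(0 <= k && k < 2);
        union { float32x2_t v; float fs[2]; } pun = {v};
        return pun.fs[k & 1];
    }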
| 117 | 117 |
| 118 template <> | 118 template <> |
| 119 class SkNx<4, int> { | |
| 120 public: | |
| 121 SkNx(const int32x4_t& vec) : fVec(vec) {} | |
| 122 | |
| 123 SkNx() {} | |
| 124 SkNx(int val) : fVec(vdupq_n_s32(val)) {} | |
| 125 static SkNx Load(const void* ptr) { return vld1q_s32((const int*)ptr); } | |
| 126 SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } | |
| 127 | |
| 128 void store(void* ptr) const { vst1q_s32((int*)ptr, fVec); } | |
| 129 | |
| 130 SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } | |
| 131 SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } | |
| 132 SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } | |
| 133 | |
| 134 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | |
| 135 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | |
| 136 | |
| 137 template <int k> int kth() const { | |
| 138 SkASSERT(0 <= k && k < 4); | |
| 139 return vgetq_lane_s32(fVec, k&3); | |
| 140 } | |
| 141 | |
| 142 int32x4_t fVec; | |
| 143 }; | |
| 144 | |
| 145 template <> | |
| 146 class SkNx<4, float> { | 119 class SkNx<4, float> { |
| 147 public: | 120 public: |
| 148 SkNx(float32x4_t vec) : fVec(vec) {} | 121 SkNx(float32x4_t vec) : fVec(vec) {} |
| 149 | 122 |
| 150 SkNx() {} | 123 SkNx() {} |
| 151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} | 124 SkNx(float val) : fVec(vdupq_n_f32(val)) {} |
| 152 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } | 125 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } |
| 153 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } | 126 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } |
| 154 | 127 |
| 155 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } | 128 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } |
| (...skipping 44 matching lines...) | |
| 200 } | 173 } |
| 201 | 174 |
| 202 SkNx sqrt() const { | 175 SkNx sqrt() const { |
| 203 #if defined(SK_CPU_ARM64) | 176 #if defined(SK_CPU_ARM64) |
| 204 return vsqrtq_f32(fVec); | 177 return vsqrtq_f32(fVec); |
| 205 #else | 178 #else |
| 206 return *this * this->rsqrt2(); | 179 return *this * this->rsqrt2(); |
| 207 #endif | 180 #endif |
| 208 } | 181 } |
| 209 | 182 |
| 210 template <int k> float kth() const { | 183 float operator[](int k) const { |
| 211 SkASSERT(0 <= k && k < 4); | 184 SkASSERT(0 <= k && k < 4); |
| 212 return vgetq_lane_f32(fVec, k&3); | 185 union { float32x4_t v; float fs[4]; } pun = {fVec}; |
| 186 return pun.fs[k&3]; |
| 213 } | 187 } |
| 188 template <int k> float kth() const { return (*this)[k]; } |
| 214 | 189 |
| 215 bool allTrue() const { | 190 bool allTrue() const { |
| 216 auto v = vreinterpretq_u32_f32(fVec); | 191 auto v = vreinterpretq_u32_f32(fVec); |
| 217 return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1) | 192 return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1) |
| 218 && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3); | 193 && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3); |
| 219 } | 194 } |
| 220 bool anyTrue() const { | 195 bool anyTrue() const { |
| 221 auto v = vreinterpretq_u32_f32(fVec); | 196 auto v = vreinterpretq_u32_f32(fVec); |
| 222 return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1) | 197 return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1) |
| 223 || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3); | 198 || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3); |
| (...skipping 26 matching lines...) | |
| 250 | 225 |
| 251 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } | 226 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } |
| 252 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } | 227 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } |
| 253 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } | 228 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } |
| 254 | 229 |
| 255 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } | 230 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } |
| 256 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } | 231 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } |
| 257 | 232 |
| 258 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } | 233 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } |
| 259 | 234 |
| 260 template <int k> uint16_t kth() const { | 235 uint16_t operator[](int k) const { |
| 261 SkASSERT(0 <= k && k < 4); | 236 SkASSERT(0 <= k && k < 4); |
| 262 return vget_lane_u16(fVec, k&3); | 237 union { uint16x4_t v; uint16_t us[4]; } pun = {fVec}; |
| 238 return pun.us[k&3]; |
| 263 } | 239 } |
| 240 template <int k> uint16_t kth() const { return (*this)[k]; } |
| 264 | 241 |
| 265 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 242 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 266 return vbsl_u16(fVec, t.fVec, e.fVec); | 243 return vbsl_u16(fVec, t.fVec, e.fVec); |
| 267 } | 244 } |
| 268 | 245 |
| 269 uint16x4_t fVec; | 246 uint16x4_t fVec; |
| 270 }; | 247 }; |
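thenElse() leans on vbsl_u16, a bitwise select: each result bit comes from t where the mask bit is 1 and from e where it is 0. Because NEON comparisons produce all-ones or all-zero lanes, this gives a branch-free per-lane ternary. A hypothetical usage sketch:

    #include <arm_neon.h>

    // r[i] = (a[i] < b[i]) ? t[i] : e[i], with no branches.
    static inline uint16x4_t select_lt(uint16x4_t a, uint16x4_t b,
                                       uint16x4_t t, uint16x4_t e) {
        uint16x4_t mask = vclt_u16(a, b);   // 0xFFFF where a < b, else 0x0000
        return vbsl_u16(mask, t, e);        // (mask & t) | (~mask & e)
    }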
| 271 | 248 |
| 272 template <> | 249 template <> |
| 273 class SkNx<8, uint16_t> { | 250 class SkNx<8, uint16_t> { |
| (...skipping 13 matching lines...) | |
| 287 | 264 |
| 288 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } | 265 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } |
| 289 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } | 266 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } |
| 290 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } | 267 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } |
| 291 | 268 |
| 292 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } | 269 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } |
| 293 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } | 270 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } |
| 294 | 271 |
| 295 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } | 272 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } |
| 296 | 273 |
| 297 template <int k> uint16_t kth() const { | 274 uint16_t operator[](int k) const { |
| 298 SkASSERT(0 <= k && k < 8); | 275 SkASSERT(0 <= k && k < 8); |
| 299 return vgetq_lane_u16(fVec, k&7); | 276 union { uint16x8_t v; uint16_t us[8]; } pun = {fVec}; |
| 277 return pun.us[k&7]; |
| 300 } | 278 } |
| 279 template <int k> uint16_t kth() const { return (*this)[k]; } |
| 301 | 280 |
| 302 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 281 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 303 return vbslq_u16(fVec, t.fVec, e.fVec); | 282 return vbslq_u16(fVec, t.fVec, e.fVec); |
| 304 } | 283 } |
| 305 | 284 |
| 306 uint16x8_t fVec; | 285 uint16x8_t fVec; |
| 307 }; | 286 }; |
| 308 | 287 |
| 309 template <> | 288 template <> |
| 310 class SkNx<4, uint8_t> { | 289 class SkNx<4, uint8_t> { |
| (...skipping 32 matching lines...) | |
| 343 void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } | 322 void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } |
| 344 | 323 |
| 345 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } | 324 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } |
| 346 | 325 |
| 347 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } | 326 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } |
| 348 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } | 327 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } |
| 349 | 328 |
| 350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); } | 329 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); } |
| 351 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } | 330 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } |
| 352 | 331 |
| 353 template <int k> uint8_t kth() const { | 332 uint8_t operator[](int k) const { |
| 354 SkASSERT(0 <= k && k < 15); | 333 SkASSERT(0 <= k && k < 16); |
| 355 return vgetq_lane_u8(fVec, k&16); | 334 union { uint8x16_t v; uint8_t us[16]; } pun = {fVec}; |
| 335 return pun.us[k&15]; |
| 356 } | 336 } |
| 337 template <int k> uint8_t kth() const { return (*this)[k]; } |
| 357 | 338 |
| 358 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 339 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 359 return vbslq_u8(fVec, t.fVec, e.fVec); | 340 return vbslq_u8(fVec, t.fVec, e.fVec); |
| 360 } | 341 } |
| 361 | 342 |
| 362 uint8x16_t fVec; | 343 uint8x16_t fVec; |
| 363 }; | 344 }; |
| 364 | 345 |
| 365 #undef SHIFT32 | 346 #undef SHIFT32 |
| 366 #undef SHIFT16 | 347 #undef SHIFT16 |
| 367 #undef SHIFT8 | 348 #undef SHIFT8 |
| 368 | 349 |
| 369 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { | 350 template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
| 370 return vcvtq_s32_f32(src.fVec); | |
| 371 } | |
| 372 | |
| 373 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { | |
| 374 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); | 351 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); |
| 375 uint16x4_t _16 = vqmovn_u32(_32); | 352 uint16x4_t _16 = vqmovn_u32(_32); |
| 376 return vqmovn_u16(vcombine_u16(_16, _16)); | 353 return vqmovn_u16(vcombine_u16(_16, _16)); |
| 377 } | 354 } |
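The float-to-byte cast narrows in two saturating steps: vqmovn_u32 clamps each 32-bit value into 16 bits, then vqmovn_u16 clamps into 8; vcombine_u16(_16, _16) merely pads the 64-bit half up to the 128 bits vqmovn_u16 expects, and only the low half survives. A hedged usage sketch, with values chosen purely for illustration:

    Sk4f f(0.0f, 1.0f, 255.0f, 300.0f);
    Sk4b b = SkNx_cast<uint8_t, float>(f);   // 300 saturates to 255 on the 16->8 narrow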
| 378 | 355 |
| 379 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { | 356 template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { |
| 380 uint16x8_t _16 = vmovl_u8 (src.fVec) ; | 357 uint16x8_t _16 = vmovl_u8 (src.fVec) ; |
| 381 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); | 358 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); |
| 382 return vcvtq_f32_u32(_32); | 359 return vcvtq_f32_u32(_32); |
| 383 } | 360 } |
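The reverse cast widens stepwise: vmovl_u8 zero-extends bytes to 16 bits, vget_low_u16 keeps the four lanes that matter, vmovl_u16 widens again, and vcvtq_f32_u32 converts. Tracing a sample input:

    // src = {1, 2, 3, 4, x, x, x, x} as uint8x8_t (upper four lanes ignored):
    //   vmovl_u8                 -> uint16x8_t {1, 2, 3, 4, x, x, x, x}
    //   vget_low_u16 + vmovl_u16 -> uint32x4_t {1, 2, 3, 4}
    //   vcvtq_f32_u32            -> float32x4_t {1.f, 2.f, 3.f, 4.f}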
| 384 | 361 |
| 385 static inline void Sk4f_ToBytes(uint8_t bytes[16], | 362 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
| 386 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { | 363 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { |
| 387 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), | 364 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), |
| 388 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], | 365 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], |
| 389 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), | 366 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), |
| 390 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]); | 367 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]); |
| 391 } | 368 } |
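Sk4f_ToBytes packs sixteen floats into sixteen bytes without a narrowing instruction: after vcvtq_u32_f32, each value sits in byte 0 of its 32-bit lane, and each vuzpq_u8(...).val[0] keeps the even-indexed bytes of its two inputs, halving the stride between useful bytes. Two rounds compact everything, assuming the values already fit in a byte:

    // Bytes after vcvtq_u32_f32 (little-endian lanes), e.g. a = {10,20,30,40}:
    //   [10 0 0 0  20 0 0 0  30 0 0 0  40 0 0 0]
    // Round 1, vuzpq_u8(a_bytes, b_bytes).val[0] (even bytes of a, then of b):
    //   [10 0 20 0 30 0 40 0  b0 0 b1 0 b2 0 b3 0]
    // Round 2 over the two round-1 results keeps even bytes once more:
    //   [a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3]  -> vst1q_u8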
| 392 | 369 |
| 393 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) { | 370 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { |
| 394 return vget_low_u16(vmovl_u8(src.fVec)); | 371 return vget_low_u16(vmovl_u8(src.fVec)); |
| 395 } | 372 } |
| 396 | 373 |
| 397 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { | 374 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
| 398 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); | 375 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); |
| 399 } | 376 } |
| 400 | 377 |
| 401 } // namespace | |
| 402 | |
| 403 #endif//SkNx_neon_DEFINED | 378 #endif//SkNx_neon_DEFINED |