| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
| 9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 | 12 |
| 13 #define SKNX_IS_FAST | 13 #define SKNX_IS_FAST |
| 14 | 14 |
| 15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: | 15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: |
| 16 // - roundtrip through integers via truncation | 16 // - roundtrip through integers via truncation |
| 17 // - subtract 1 if that's too big (possible for negative values). | 17 // - subtract 1 if that's too big (possible for negative values). |
| 18 // This restricts the domain of our inputs to a maximum somehwere around 2^31.
Seems plenty big. | 18 // This restricts the domain of our inputs to a maximum somehwere around 2^31.
Seems plenty big. |
| 19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { | 19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { |
| 20 float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); | 20 float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); |
| 21 uint32x4_t too_big = roundtrip > v; | 21 uint32x4_t too_big = roundtrip > v; |
| 22 return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1
)); | 22 return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1
)); |
| 23 } | 23 } |
| 24 | 24 |
| 25 // Well, this is absurd. The shifts require compile-time constant arguments. | |
| 26 | |
| 27 #define SHIFT8(op, v, bits) switch(bits) { \ | |
| 28 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v
, 3); \ | |
| 29 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v
, 6); \ | |
| 30 case 7: return op(v, 7); \ | |
| 31 } return fVec | |
| 32 | |
| 33 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits)
{ \ | |
| 34 case 8: return op(v, 8); case 9: return op(v
, 9); \ | |
| 35 case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v
, 12); \ | |
| 36 case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v
, 15); \ | |
| 37 } return fVec | |
| 38 | |
| 39 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bit
s) { \ | |
| 40 case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v
, 18); \ | |
| 41 case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v
, 21); \ | |
| 42 case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v
, 24); \ | |
| 43 case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v
, 27); \ | |
| 44 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v
, 30); \ | |
| 45 case 31: return op(v, 31); } return fVec | |
| 46 | |
| 47 template <> | 25 template <> |
| 48 class SkNx<2, float> { | 26 class SkNx<2, float> { |
| 49 public: | 27 public: |
| 50 SkNx(float32x2_t vec) : fVec(vec) {} | 28 SkNx(float32x2_t vec) : fVec(vec) {} |
| 51 | 29 |
| 52 SkNx() {} | 30 SkNx() {} |
| 53 SkNx(float a, float b) : fVec{a,b} {} | 31 SkNx(float a, float b) : fVec{a,b} {} |
| 54 SkNx(float v) : fVec{v,v} {} | 32 SkNx(float v) : fVec{v,v} {} |
| 55 | 33 |
| 56 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } | 34 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } |
| (...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 197 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} | 175 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} |
| 198 SkNx(uint16_t v) : fVec{v,v,v,v} {} | 176 SkNx(uint16_t v) : fVec{v,v,v,v} {} |
| 199 | 177 |
| 200 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } | 178 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } |
| 201 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } | 179 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } |
| 202 | 180 |
| 203 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 181 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
| 204 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 182 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
| 205 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 183 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
| 206 | 184 |
| 207 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } | 185 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
| 208 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } | 186 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
| 209 | 187 |
| 210 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
ec); } | 188 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
ec); } |
| 211 | 189 |
| 212 uint16_t operator[](int k) const { return fVec[k&3]; } | 190 uint16_t operator[](int k) const { return fVec[k&3]; } |
| 213 | 191 |
| 214 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 192 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 215 return vbsl_u16(fVec, t.fVec, e.fVec); | 193 return vbsl_u16(fVec, t.fVec, e.fVec); |
| 216 } | 194 } |
| 217 | 195 |
| 218 uint16x4_t fVec; | 196 uint16x4_t fVec; |
| 219 }; | 197 }; |
| 220 | 198 |
| 221 template <> | 199 template <> |
| 222 class SkNx<8, uint16_t> { | 200 class SkNx<8, uint16_t> { |
| 223 public: | 201 public: |
| 224 SkNx(const uint16x8_t& vec) : fVec(vec) {} | 202 SkNx(const uint16x8_t& vec) : fVec(vec) {} |
| 225 | 203 |
| 226 SkNx() {} | 204 SkNx() {} |
| 227 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, | 205 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, |
| 228 uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h}
{} | 206 uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h}
{} |
| 229 SkNx(uint16_t v) : fVec{v,v,v,v,v,v,v,v}
{} | 207 SkNx(uint16_t v) : fVec{v,v,v,v,v,v,v,v}
{} |
| 230 | 208 |
| 231 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr);
} | 209 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr);
} |
| 232 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } | 210 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } |
| 233 | 211 |
| 234 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 212 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
| 235 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 213 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
| 236 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 214 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
| 237 | 215 |
| 238 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } | 216 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
| 239 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } | 217 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
| 240 | 218 |
| 241 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
Vec); } | 219 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
Vec); } |
| 242 | 220 |
| 243 uint16_t operator[](int k) const { return fVec[k&7]; } | 221 uint16_t operator[](int k) const { return fVec[k&7]; } |
| 244 | 222 |
| 245 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 223 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 246 return vbslq_u16(fVec, t.fVec, e.fVec); | 224 return vbslq_u16(fVec, t.fVec, e.fVec); |
| 247 } | 225 } |
| 248 | 226 |
| 249 uint16x8_t fVec; | 227 uint16x8_t fVec; |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 317 void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } | 295 void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } |
| 318 | 296 |
| 319 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 297 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
| 320 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 298 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
| 321 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 299 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
| 322 | 300 |
| 323 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 301 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } |
| 324 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 302 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } |
| 325 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 303 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } |
| 326 | 304 |
| 327 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | 305 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
| 328 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | 306 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
| 329 | 307 |
| 330 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 308 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } |
| 331 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } | 309 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } |
| 332 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } | 310 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } |
| 333 | 311 |
| 334 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.f
Vec); } | 312 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.f
Vec); } |
| 335 | 313 |
| 336 int32_t operator[](int k) const { return fVec[k&3]; } | 314 int32_t operator[](int k) const { return fVec[k&3]; } |
| 337 | 315 |
| 338 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 316 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| (...skipping 16 matching lines...) Expand all Loading... |
| 355 void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } | 333 void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } |
| 356 | 334 |
| 357 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 335 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
| 358 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 336 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
| 359 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 337 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
| 360 | 338 |
| 361 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 339 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } |
| 362 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 340 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } |
| 363 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 341 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } |
| 364 | 342 |
| 365 SkNx operator << (int bits) const { SHIFT32(vshlq_n_u32, fVec, bits); } | 343 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
| 366 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_u32, fVec, bits); } | 344 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
| 367 | 345 |
| 368 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 346 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } |
| 369 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } | 347 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } |
| 370 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } | 348 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } |
| 371 | 349 |
| 372 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.f
Vec); } | 350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.f
Vec); } |
| 373 | 351 |
| 374 uint32_t operator[](int k) const { return fVec[k&3]; } | 352 uint32_t operator[](int k) const { return fVec[k&3]; } |
| 375 | 353 |
| 376 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 354 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 377 return vbslq_u32(fVec, t.fVec, e.fVec); | 355 return vbslq_u32(fVec, t.fVec, e.fVec); |
| 378 } | 356 } |
| 379 | 357 |
| 380 uint32x4_t fVec; | 358 uint32x4_t fVec; |
| 381 }; | 359 }; |
| 382 | 360 |
| 383 #undef SHIFT32 | |
| 384 #undef SHIFT16 | |
| 385 #undef SHIFT8 | |
| 386 | |
| 387 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { | 361 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { |
| 388 return vcvtq_s32_f32(src.fVec); | 362 return vcvtq_s32_f32(src.fVec); |
| 389 | 363 |
| 390 } | 364 } |
| 391 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { | 365 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { |
| 392 return vcvtq_f32_s32(src.fVec); | 366 return vcvtq_f32_s32(src.fVec); |
| 393 } | 367 } |
| 394 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { | 368 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { |
| 395 return SkNx_cast<float>(Sk4i::Load(&src)); | 369 return SkNx_cast<float>(Sk4i::Load(&src)); |
| 396 } | 370 } |
| (...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 466 uint16x4x4_t rgba = {{ | 440 uint16x4x4_t rgba = {{ |
| 467 r.fVec, | 441 r.fVec, |
| 468 g.fVec, | 442 g.fVec, |
| 469 b.fVec, | 443 b.fVec, |
| 470 a.fVec, | 444 a.fVec, |
| 471 }}; | 445 }}; |
| 472 vst4_u16((uint16_t*) dst, rgba); | 446 vst4_u16((uint16_t*) dst, rgba); |
| 473 } | 447 } |
| 474 | 448 |
| 475 #endif//SkNx_neon_DEFINED | 449 #endif//SkNx_neon_DEFINED |
| OLD | NEW |