| OLD | NEW | 
|---|
| 1 /* | 1 /* | 
| 2  * Copyright 2015 Google Inc. | 2  * Copyright 2015 Google Inc. | 
| 3  * | 3  * | 
| 4  * Use of this source code is governed by a BSD-style license that can be | 4  * Use of this source code is governed by a BSD-style license that can be | 
| 5  * found in the LICENSE file. | 5  * found in the LICENSE file. | 
| 6  */ | 6  */ | 
| 7 | 7 | 
| 8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED | 
| 9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED | 
| 10 | 10 | 
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> | 
| 12 | 12 | 
| 13 #define SKNX_IS_FAST | 13 #define SKNX_IS_FAST | 
| 14 | 14 | 
| 15 // ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it: | 15 // ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it: | 
| 16 //   - roundtrip through integers via truncation | 16 //   - roundtrip through integers via truncation | 
| 17 //   - subtract 1 if that's too big (possible for negative values). | 17 //   - subtract 1 if that's too big (possible for negative values). | 
| 18 // This restricts the domain of our inputs to a maximum somewhere around 2^31.  
     Seems plenty big. | 18 // This restricts the domain of our inputs to a maximum somewhere around 2^31.  
     Seems plenty big. | 
| 19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { | 19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { | 
| 20     float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); | 20     float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); | 
| 21     uint32x4_t too_big = roundtrip > v; | 21     uint32x4_t too_big = roundtrip > v; | 
| 22     return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1
     )); | 22     return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1
     )); | 
| 23 } | 23 } | 
| 24 | 24 | 
| 25 // Well, this is absurd.  The shifts require compile-time constant arguments. |  | 
| 26 |  | 
| 27 #define SHIFT8(op, v, bits) switch(bits) { \ |  | 
| 28     case  1: return op(v,  1);  case  2: return op(v,  2);  case  3: return op(v
     ,  3); \ |  | 
| 29     case  4: return op(v,  4);  case  5: return op(v,  5);  case  6: return op(v
     ,  6); \ |  | 
| 30     case  7: return op(v,  7); \ |  | 
| 31     } return fVec |  | 
| 32 |  | 
| 33 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits)
      { \ |  | 
| 34                                 case  8: return op(v,  8);  case  9: return op(v
     ,  9); \ |  | 
| 35     case 10: return op(v, 10);  case 11: return op(v, 11);  case 12: return op(v
     , 12); \ |  | 
| 36     case 13: return op(v, 13);  case 14: return op(v, 14);  case 15: return op(v
     , 15); \ |  | 
| 37     } return fVec |  | 
| 38 |  | 
| 39 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bit
     s) { \ |  | 
| 40     case 16: return op(v, 16);  case 17: return op(v, 17);  case 18: return op(v
     , 18); \ |  | 
| 41     case 19: return op(v, 19);  case 20: return op(v, 20);  case 21: return op(v
     , 21); \ |  | 
| 42     case 22: return op(v, 22);  case 23: return op(v, 23);  case 24: return op(v
     , 24); \ |  | 
| 43     case 25: return op(v, 25);  case 26: return op(v, 26);  case 27: return op(v
     , 27); \ |  | 
| 44     case 28: return op(v, 28);  case 29: return op(v, 29);  case 30: return op(v
     , 30); \ |  | 
| 45     case 31: return op(v, 31); } return fVec |  | 
| 46 |  | 
| 47 template <> | 25 template <> | 
| 48 class SkNx<2, float> { | 26 class SkNx<2, float> { | 
| 49 public: | 27 public: | 
| 50     SkNx(float32x2_t vec) : fVec(vec) {} | 28     SkNx(float32x2_t vec) : fVec(vec) {} | 
| 51 | 29 | 
| 52     SkNx() {} | 30     SkNx() {} | 
| 53     SkNx(float a, float b) : fVec{a,b} {} | 31     SkNx(float a, float b) : fVec{a,b} {} | 
| 54     SkNx(float v)          : fVec{v,v} {} | 32     SkNx(float v)          : fVec{v,v} {} | 
| 55 | 33 | 
| 56     static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } | 34     static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } | 
| (...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 197     SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} | 175     SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} | 
| 198     SkNx(uint16_t v)                                     : fVec{v,v,v,v} {} | 176     SkNx(uint16_t v)                                     : fVec{v,v,v,v} {} | 
| 199 | 177 | 
| 200     static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } | 178     static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } | 
| 201     void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } | 179     void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } | 
| 202 | 180 | 
| 203     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 181     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 
| 204     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 182     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 
| 205     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 183     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 
| 206 | 184 | 
| 207     SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } | 185     SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 
| 208     SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } | 186     SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 
| 209 | 187 | 
| 210     static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
     ec); } | 188     static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
     ec); } | 
| 211 | 189 | 
| 212     uint16_t operator[](int k) const { return fVec[k&3]; } | 190     uint16_t operator[](int k) const { return fVec[k&3]; } | 
| 213 | 191 | 
| 214     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 192     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 
| 215         return vbsl_u16(fVec, t.fVec, e.fVec); | 193         return vbsl_u16(fVec, t.fVec, e.fVec); | 
| 216     } | 194     } | 
| 217 | 195 | 
| 218     uint16x4_t fVec; | 196     uint16x4_t fVec; | 
| 219 }; | 197 }; | 
| 220 | 198 | 
| 221 template <> | 199 template <> | 
| 222 class SkNx<8, uint16_t> { | 200 class SkNx<8, uint16_t> { | 
| 223 public: | 201 public: | 
| 224     SkNx(const uint16x8_t& vec) : fVec(vec) {} | 202     SkNx(const uint16x8_t& vec) : fVec(vec) {} | 
| 225 | 203 | 
| 226     SkNx() {} | 204     SkNx() {} | 
| 227     SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, | 205     SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, | 
| 228          uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h}
      {} | 206          uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h}
      {} | 
| 229     SkNx(uint16_t v)                                     : fVec{v,v,v,v,v,v,v,v}
      {} | 207     SkNx(uint16_t v)                                     : fVec{v,v,v,v,v,v,v,v}
      {} | 
| 230 | 208 | 
| 231     static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); 
     } | 209     static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); 
     } | 
| 232     void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } | 210     void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } | 
| 233 | 211 | 
| 234     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 212     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 
| 235     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 213     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 
| 236     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 214     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 
| 237 | 215 | 
| 238     SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } | 216     SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 
| 239     SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } | 217     SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 
| 240 | 218 | 
| 241     static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
     Vec); } | 219     static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
     Vec); } | 
| 242 | 220 | 
| 243     uint16_t operator[](int k) const { return fVec[k&7]; } | 221     uint16_t operator[](int k) const { return fVec[k&7]; } | 
| 244 | 222 | 
| 245     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 223     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 
| 246         return vbslq_u16(fVec, t.fVec, e.fVec); | 224         return vbslq_u16(fVec, t.fVec, e.fVec); | 
| 247     } | 225     } | 
| 248 | 226 | 
| 249     uint16x8_t fVec; | 227     uint16x8_t fVec; | 
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 317     void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } | 295     void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } | 
| 318 | 296 | 
| 319     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 297     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 
| 320     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 298     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 
| 321     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 299     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 
| 322 | 300 | 
| 323     SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 301     SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 
| 324     SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 302     SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 
| 325     SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 303     SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 
| 326 | 304 | 
| 327     SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | 305     SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 
| 328     SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | 306     SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 
| 329 | 307 | 
| 330     SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 308     SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 
| 331     SkNx operator <  (const SkNx& o) const { return fVec <  o.fVec; } | 309     SkNx operator <  (const SkNx& o) const { return fVec <  o.fVec; } | 
| 332     SkNx operator >  (const SkNx& o) const { return fVec >  o.fVec; } | 310     SkNx operator >  (const SkNx& o) const { return fVec >  o.fVec; } | 
| 333 | 311 | 
| 334     static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.f
     Vec); } | 312     static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.f
     Vec); } | 
| 335 | 313 | 
| 336     int32_t operator[](int k) const { return fVec[k&3]; } | 314     int32_t operator[](int k) const { return fVec[k&3]; } | 
| 337 | 315 | 
| 338     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 316     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 
| (...skipping 16 matching lines...) Expand all  Loading... | 
| 355     void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } | 333     void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } | 
| 356 | 334 | 
| 357     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 335     SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 
| 358     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 336     SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 
| 359     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 337     SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 
| 360 | 338 | 
| 361     SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 339     SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 
| 362     SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 340     SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 
| 363     SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 341     SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 
| 364 | 342 | 
| 365     SkNx operator << (int bits) const { SHIFT32(vshlq_n_u32, fVec, bits); } | 343     SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 
| 366     SkNx operator >> (int bits) const { SHIFT32(vshrq_n_u32, fVec, bits); } | 344     SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 
| 367 | 345 | 
| 368     SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 346     SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 
| 369     SkNx operator <  (const SkNx& o) const { return fVec <  o.fVec; } | 347     SkNx operator <  (const SkNx& o) const { return fVec <  o.fVec; } | 
| 370     SkNx operator >  (const SkNx& o) const { return fVec >  o.fVec; } | 348     SkNx operator >  (const SkNx& o) const { return fVec >  o.fVec; } | 
| 371 | 349 | 
| 372     static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.f
     Vec); } | 350     static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.f
     Vec); } | 
| 373 | 351 | 
| 374     uint32_t operator[](int k) const { return fVec[k&3]; } | 352     uint32_t operator[](int k) const { return fVec[k&3]; } | 
| 375 | 353 | 
| 376     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 354     SkNx thenElse(const SkNx& t, const SkNx& e) const { | 
| 377         return vbslq_u32(fVec, t.fVec, e.fVec); | 355         return vbslq_u32(fVec, t.fVec, e.fVec); | 
| 378     } | 356     } | 
| 379 | 357 | 
| 380     uint32x4_t fVec; | 358     uint32x4_t fVec; | 
| 381 }; | 359 }; | 
| 382 | 360 | 
| 383 #undef SHIFT32 |  | 
| 384 #undef SHIFT16 |  | 
| 385 #undef SHIFT8 |  | 
| 386 |  | 
| 387 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { | 361 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { | 
| 388     return vcvtq_s32_f32(src.fVec); | 362     return vcvtq_s32_f32(src.fVec); | 
| 389 | 363 | 
| 390 } | 364 } | 
| 391 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { | 365 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { | 
| 392     return vcvtq_f32_s32(src.fVec); | 366     return vcvtq_f32_s32(src.fVec); | 
| 393 } | 367 } | 
| 394 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { | 368 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { | 
| 395     return SkNx_cast<float>(Sk4i::Load(&src)); | 369     return SkNx_cast<float>(Sk4i::Load(&src)); | 
| 396 } | 370 } | 
| (...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 466     uint16x4x4_t rgba = {{ | 440     uint16x4x4_t rgba = {{ | 
| 467         r.fVec, | 441         r.fVec, | 
| 468         g.fVec, | 442         g.fVec, | 
| 469         b.fVec, | 443         b.fVec, | 
| 470         a.fVec, | 444         a.fVec, | 
| 471     }}; | 445     }}; | 
| 472     vst4_u16((uint16_t*) dst, rgba); | 446     vst4_u16((uint16_t*) dst, rgba); | 
| 473 } | 447 } | 
| 474 | 448 | 
| 475 #endif//SkNx_neon_DEFINED | 449 #endif//SkNx_neon_DEFINED | 
| OLD | NEW | 
|---|