OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 | 12 |
13 #define SKNX_IS_FAST | 13 #define SKNX_IS_FAST |
14 | 14 |
15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: | 15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: |
16 // - roundtrip through integers via truncation | 16 // - roundtrip through integers via truncation |
17 // - subtract 1 if that's too big (possible for negative values). | 17 // - subtract 1 if that's too big (possible for negative values). |
18 // This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big. | 18 // This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big. |
19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { | 19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { |
20 float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); | 20 float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); |
21 uint32x4_t too_big = roundtrip > v; | 21 uint32x4_t too_big = roundtrip > v; |
22 return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)); | 22 return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)); |
23 } | 23 } |
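(Illustration, not part of this patch: a scalar model of the same fixup; the name scalar_floor is hypothetical. Truncation rounds toward zero, so a negative non-integer roundtrips to a value above the input, and subtracting 1 lands on the floor, e.g. -1.5f truncates to -1.0f, which is too big, so the fixup yields -2.0f.)

    static inline float scalar_floor(float x) {
        float roundtrip = (float)(int32_t)x;     // truncate toward zero
        return roundtrip > x ? roundtrip - 1.0f  // roundtrip overshot: negative non-integer
                             : roundtrip;        // already the floor
    }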
24 | 24 |
| 25 // Well, this is absurd. The shifts require compile-time constant arguments. |
| 26 |
| 27 #define SHIFT8(op, v, bits) switch(bits) { \ |
| 28 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \ |
| 29 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \ |
| 30 case 7: return op(v, 7); \ |
| 31 } return fVec |
| 32 |
| 33 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \ |
| 34 case 8: return op(v, 8); case 9: return op(v, 9); \ |
| 35 case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v, 12); \ |
| 36 case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v, 15); \ |
| 37 } return fVec |
| 38 |
| 39 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \ |
| 40 case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \ |
| 41 case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \ |
| 42 case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \ |
| 43 case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \ |
| 44 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \ |
| 45 case 31: return op(v, 31); } return fVec |
| 46 |
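(Illustration, not part of this patch: the dispatch these macros expand to, written out as a hypothetical standalone function shl_sketch for the SHIFT8 arm of a uint16x8_t left shift. NEON's immediate-shift intrinsics such as vshlq_n_u16 only accept a literal count, so a runtime bits value needs one case per constant; counts 8 through 15 would get their own cases exactly as SHIFT16 spells out.)

    static inline uint16x8_t shl_sketch(uint16x8_t v, int bits) {
        switch (bits) {  // one literal immediate per reachable count
            case 1: return vshlq_n_u16(v, 1);  case 2: return vshlq_n_u16(v, 2);
            case 3: return vshlq_n_u16(v, 3);  case 4: return vshlq_n_u16(v, 4);
            case 5: return vshlq_n_u16(v, 5);  case 6: return vshlq_n_u16(v, 6);
            case 7: return vshlq_n_u16(v, 7);
        }
        return v;  // bits == 0: shifting by zero leaves the vector unchanged
    }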
25 template <> | 47 template <> |
26 class SkNx<2, float> { | 48 class SkNx<2, float> { |
27 public: | 49 public: |
28 SkNx(float32x2_t vec) : fVec(vec) {} | 50 SkNx(float32x2_t vec) : fVec(vec) {} |
29 | 51 |
30 SkNx() {} | 52 SkNx() {} |
31 SkNx(float a, float b) : fVec{a,b} {} | 53 SkNx(float a, float b) : fVec{a,b} {} |
32 SkNx(float v) : fVec{v,v} {} | 54 SkNx(float v) : fVec{v,v} {} |
33 | 55 |
34 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } | 56 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } |
(...skipping 140 matching lines...)
175 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} | 197 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} |
176 SkNx(uint16_t v) : fVec{v,v,v,v} {} | 198 SkNx(uint16_t v) : fVec{v,v,v,v} {} |
177 | 199 |
178 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } | 200 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } |
179 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } | 201 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } |
180 | 202 |
181 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 203 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
182 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 204 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
183 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 205 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
184 | 206 |
185 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 207 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } |
186 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 208 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } |
187 | 209 |
188 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } | 210 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } |
189 | 211 |
190 uint16_t operator[](int k) const { return fVec[k&3]; } | 212 uint16_t operator[](int k) const { return fVec[k&3]; } |
191 | 213 |
192 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 214 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
193 return vbsl_u16(fVec, t.fVec, e.fVec); | 215 return vbsl_u16(fVec, t.fVec, e.fVec); |
194 } | 216 } |
195 | 217 |
196 uint16x4_t fVec; | 218 uint16x4_t fVec; |
197 }; | 219 }; |
198 | 220 |
199 template <> | 221 template <> |
200 class SkNx<8, uint16_t> { | 222 class SkNx<8, uint16_t> { |
201 public: | 223 public: |
202 SkNx(const uint16x8_t& vec) : fVec(vec) {} | 224 SkNx(const uint16x8_t& vec) : fVec(vec) {} |
203 | 225 |
204 SkNx() {} | 226 SkNx() {} |
205 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, | 227 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, |
206 uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h} {} | 228 uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h} {} |
207 SkNx(uint16_t v) : fVec{v,v,v,v,v,v,v,v} {} | 229 SkNx(uint16_t v) : fVec{v,v,v,v,v,v,v,v} {} |
208 | 230 |
209 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); } | 231 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); } |
210 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } | 232 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } |
211 | 233 |
212 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 234 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
213 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 235 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
214 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 236 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
215 | 237 |
216 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 238 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } |
217 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 239 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } |
218 | 240 |
219 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } | 241 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } |
220 | 242 |
221 uint16_t operator[](int k) const { return fVec[k&7]; } | 243 uint16_t operator[](int k) const { return fVec[k&7]; } |
222 | 244 |
223 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 245 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
224 return vbslq_u16(fVec, t.fVec, e.fVec); | 246 return vbslq_u16(fVec, t.fVec, e.fVec); |
225 } | 247 } |
226 | 248 |
227 uint16x8_t fVec; | 249 uint16x8_t fVec; |
(...skipping 67 matching lines...)
295 void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } | 317 void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } |
296 | 318 |
297 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 319 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
298 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 320 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
299 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 321 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
300 | 322 |
301 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 323 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } |
302 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 324 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } |
303 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 325 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } |
304 | 326 |
305 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 327 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } |
306 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 328 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } |
307 | 329 |
308 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 330 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } |
309 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } | 331 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } |
310 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } | 332 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } |
311 | 333 |
312 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); } | 334 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); } |
313 | 335 |
314 int32_t operator[](int k) const { return fVec[k&3]; } | 336 int32_t operator[](int k) const { return fVec[k&3]; } |
315 | 337 |
316 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 338 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
(...skipping 16 matching lines...)
333 void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } | 355 void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } |
334 | 356 |
335 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 357 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
336 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 358 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
337 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 359 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
338 | 360 |
339 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 361 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } |
340 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 362 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } |
341 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 363 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } |
342 | 364 |
343 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } | 365 SkNx operator << (int bits) const { SHIFT32(vshlq_n_u32, fVec, bits); } |
344 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } | 366 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_u32, fVec, bits); } |
345 | 367 |
346 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 368 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } |
347 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } | 369 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } |
348 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } | 370 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } |
349 | 371 |
350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); } | 372 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); } |
351 | 373 |
352 uint32_t operator[](int k) const { return fVec[k&3]; } | 374 uint32_t operator[](int k) const { return fVec[k&3]; } |
353 | 375 |
354 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 376 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
355 return vbslq_u32(fVec, t.fVec, e.fVec); | 377 return vbslq_u32(fVec, t.fVec, e.fVec); |
356 } | 378 } |
357 | 379 |
358 uint32x4_t fVec; | 380 uint32x4_t fVec; |
359 }; | 381 }; |
360 | 382 |
| 383 #undef SHIFT32 |
| 384 #undef SHIFT16 |
| 385 #undef SHIFT8 |
| 386 |
361 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { | 387 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { |
362 return vcvtq_s32_f32(src.fVec); | 388 return vcvtq_s32_f32(src.fVec); |
363 | 389 |
364 } | 390 } |
365 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { | 391 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { |
366 return vcvtq_f32_s32(src.fVec); | 392 return vcvtq_f32_s32(src.fVec); |
367 } | 393 } |
368 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { | 394 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { |
369 return SkNx_cast<float>(Sk4i::Load(&src)); | 395 return SkNx_cast<float>(Sk4i::Load(&src)); |
370 } | 396 } |
(...skipping 69 matching lines...)
440 uint16x4x4_t rgba = {{ | 466 uint16x4x4_t rgba = {{ |
441 r.fVec, | 467 r.fVec, |
442 g.fVec, | 468 g.fVec, |
443 b.fVec, | 469 b.fVec, |
444 a.fVec, | 470 a.fVec, |
445 }}; | 471 }}; |
446 vst4_u16((uint16_t*) dst, rgba); | 472 vst4_u16((uint16_t*) dst, rgba); |
447 } | 473 } |
448 | 474 |
449 #endif//SkNx_neon_DEFINED | 475 #endif//SkNx_neon_DEFINED |