OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 | 12 |
13 #define SKNX_IS_FAST | 13 #define SKNX_IS_FAST |
14 | 14 |
15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: | 15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: |
16 // - roundtrip through integers via truncation | 16 // - roundtrip through integers via truncation |
17 // - subtract 1 if that's too big (possible for negative values). | 17 // - subtract 1 if that's too big (possible for negative values). |
18 // This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big. | 18 // This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big. |
19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { | 19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { |
20 float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); | 20 float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); |
21 uint32x4_t too_big = roundtrip > v; | 21 uint32x4_t too_big = roundtrip > v; |
22 return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)); | 22 return roundtrip - (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)); |
23 } | 23 } |
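
The lane-wise logic above is just the vector form of a scalar floor emulation. A minimal scalar sketch, assuming a helper name (scalar_floor_sketch) that is illustrative and not part of the header:

    // Truncate toward zero, then subtract 1 whenever truncation rounded a
    // negative input up past the original value. As the comment above says,
    // this is only valid while |v| stays well below 2^31.
    static inline float scalar_floor_sketch(float v) {
        float roundtrip = (float)(int)v;          // truncates toward zero
        return roundtrip > v ? roundtrip - 1.0f   // e.g. -2.5f -> -2.0f -> -3.0f
                             : roundtrip;         // e.g.  2.5f ->  2.0f
    }
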
24 | 24 |
25 // Well, this is absurd. The shifts require compile-time constant arguments. | |
26 | |
27 #define SHIFT8(op, v, bits) switch(bits) { \ | |
28     case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \ | |
29     case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \ | |
30     case 7: return op(v, 7); \ | |
31 } return fVec | |
32 | |
33 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \ | |
34     case 8: return op(v, 8); case 9: return op(v, 9); \ | |
35     case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v, 12); \ | |
36     case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v, 15); \ | |
37 } return fVec | |
38 | |
39 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \ | |
40     case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \ | |
41     case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \ | |
42     case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \ | |
43     case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \ | |
44     case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \ | |
45     case 31: return op(v, 31); } return fVec | |
46 | |
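The macros above exist only because the vshl_n_*/vshr_n_* intrinsics require compile-time shift counts. The NEW column drops them in favor of fVec << SkNx(bits).fVec, which leans on the compiler's vector extensions and should lower to NEON's register-count shift. A minimal intrinsic-level sketch of the same idea, assuming logical shifts and 0 <= bits <= 16 (shl_u16_sketch/shr_u16_sketch are illustrative names, not the header's actual helpers):

    // vshlq_u16 takes a per-lane *signed* shift count at runtime:
    // positive counts shift left, negative counts shift right (logical).
    static inline uint16x8_t shl_u16_sketch(uint16x8_t v, int bits) {
        return vshlq_u16(v, vdupq_n_s16((int16_t) bits));
    }
    static inline uint16x8_t shr_u16_sketch(uint16x8_t v, int bits) {
        return vshlq_u16(v, vdupq_n_s16((int16_t)-bits));
    }
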
47 template <> | 25 template <> |
48 class SkNx<2, float> { | 26 class SkNx<2, float> { |
49 public: | 27 public: |
50 SkNx(float32x2_t vec) : fVec(vec) {} | 28 SkNx(float32x2_t vec) : fVec(vec) {} |
51 | 29 |
52 SkNx() {} | 30 SkNx() {} |
53 SkNx(float a, float b) : fVec{a,b} {} | 31 SkNx(float a, float b) : fVec{a,b} {} |
54 SkNx(float v) : fVec{v,v} {} | 32 SkNx(float v) : fVec{v,v} {} |
55 | 33 |
56 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } | 34 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } |
(...skipping 140 matching lines...)
197 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} | 175 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec{a,b,c,d} {} |
198 SkNx(uint16_t v) : fVec{v,v,v,v} {} | 176 SkNx(uint16_t v) : fVec{v,v,v,v} {} |
199 | 177 |
200 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } | 178 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } |
201 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } | 179 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } |
202 | 180 |
203 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 181 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
204 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 182 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
205 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 183 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
206 | 184 |
207 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } | 185 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
208 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } | 186 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
209 | 187 |
210 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } | 188 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } |
211 | 189 |
212 uint16_t operator[](int k) const { return fVec[k&3]; } | 190 uint16_t operator[](int k) const { return fVec[k&3]; } |
213 | 191 |
214 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 192 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
215 return vbsl_u16(fVec, t.fVec, e.fVec); | 193 return vbsl_u16(fVec, t.fVec, e.fVec); |
216 } | 194 } |
217 | 195 |
218 uint16x4_t fVec; | 196 uint16x4_t fVec; |
219 }; | 197 }; |
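
thenElse is a lane-wise select: vbsl_u16 takes bits from t where the mask bit is 1 and from e where it is 0, so fVec is expected to hold all-ones or all-zeros lanes, as produced by a lane comparison. A small sketch of that pattern with raw intrinsics (clamp_to_255_sketch and the clamping scenario are illustrative; a real clamp would just call vmin_u16):

    // Clamp each 16-bit lane of v to at most 255 via compare + bit-select.
    static inline uint16x4_t clamp_to_255_sketch(uint16x4_t v) {
        uint16x4_t mask = vcgt_u16(v, vdup_n_u16(255)); // all-1s where v > 255
        return vbsl_u16(mask, vdup_n_u16(255), v);      // pick 255 there, else v
    }
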
220 | 198 |
221 template <> | 199 template <> |
222 class SkNx<8, uint16_t> { | 200 class SkNx<8, uint16_t> { |
223 public: | 201 public: |
224 SkNx(const uint16x8_t& vec) : fVec(vec) {} | 202 SkNx(const uint16x8_t& vec) : fVec(vec) {} |
225 | 203 |
226 SkNx() {} | 204 SkNx() {} |
227 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, | 205 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, |
228 uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h} {} | 206 uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec{a,b,c,d,e,f,g,h} {} |
229 SkNx(uint16_t v) : fVec{v,v,v,v,v,v,v,v} {} | 207 SkNx(uint16_t v) : fVec{v,v,v,v,v,v,v,v} {} |
230 | 208 |
231 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); } | 209 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); } |
232 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } | 210 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } |
233 | 211 |
234 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 212 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
235 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 213 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
236 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 214 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
237 | 215 |
238 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } | 216 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
239 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } | 217 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
240 | 218 |
241 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } | 219 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } |
242 | 220 |
243 uint16_t operator[](int k) const { return fVec[k&7]; } | 221 uint16_t operator[](int k) const { return fVec[k&7]; } |
244 | 222 |
245 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 223 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
246 return vbslq_u16(fVec, t.fVec, e.fVec); | 224 return vbslq_u16(fVec, t.fVec, e.fVec); |
247 } | 225 } |
248 | 226 |
249 uint16x8_t fVec; | 227 uint16x8_t fVec; |
(...skipping 67 matching lines...)
317 void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } | 295 void store(void* ptr) const { return vst1q_s32((int32_t*)ptr, fVec); } |
318 | 296 |
319 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 297 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
320 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 298 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
321 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 299 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
322 | 300 |
323 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 301 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } |
324 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 302 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } |
325 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 303 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } |
326 | 304 |
327 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | 305 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
328 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | 306 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
329 | 307 |
330 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 308 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } |
331 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } | 309 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } |
332 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } | 310 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } |
333 | 311 |
334 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); } | 312 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); } |
335 | 313 |
336 int32_t operator[](int k) const { return fVec[k&3]; } | 314 int32_t operator[](int k) const { return fVec[k&3]; } |
337 | 315 |
338 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 316 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
(...skipping 16 matching lines...)
355 void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } | 333 void store(void* ptr) const { return vst1q_u32((uint32_t*)ptr, fVec); } |
356 | 334 |
357 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } | 335 SkNx operator + (const SkNx& o) const { return fVec + o.fVec; } |
358 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } | 336 SkNx operator - (const SkNx& o) const { return fVec - o.fVec; } |
359 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } | 337 SkNx operator * (const SkNx& o) const { return fVec * o.fVec; } |
360 | 338 |
361 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } | 339 SkNx operator & (const SkNx& o) const { return fVec & o.fVec; } |
362 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } | 340 SkNx operator | (const SkNx& o) const { return fVec | o.fVec; } |
363 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } | 341 SkNx operator ^ (const SkNx& o) const { return fVec ^ o.fVec; } |
364 | 342 |
365 SkNx operator << (int bits) const { SHIFT32(vshlq_n_u32, fVec, bits); } | 343 SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } |
366 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_u32, fVec, bits); } | 344 SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } |
367 | 345 |
368 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } | 346 SkNx operator == (const SkNx& o) const { return fVec == o.fVec; } |
369 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } | 347 SkNx operator < (const SkNx& o) const { return fVec < o.fVec; } |
370 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } | 348 SkNx operator > (const SkNx& o) const { return fVec > o.fVec; } |
371 | 349 |
372 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); } | 350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); } |
373 | 351 |
374 uint32_t operator[](int k) const { return fVec[k&3]; } | 352 uint32_t operator[](int k) const { return fVec[k&3]; } |
375 | 353 |
376 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 354 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
377 return vbslq_u32(fVec, t.fVec, e.fVec); | 355 return vbslq_u32(fVec, t.fVec, e.fVec); |
378 } | 356 } |
379 | 357 |
380 uint32x4_t fVec; | 358 uint32x4_t fVec; |
381 }; | 359 }; |
382 | 360 |
383 #undef SHIFT32 | |
384 #undef SHIFT16 | |
385 #undef SHIFT8 | |
386 | |
387 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { | 361 template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { |
388 return vcvtq_s32_f32(src.fVec); | 362 return vcvtq_s32_f32(src.fVec); |
389 | 363 |
390 } | 364 } |
391 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { | 365 template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { |
392 return vcvtq_f32_s32(src.fVec); | 366 return vcvtq_f32_s32(src.fVec); |
393 } | 367 } |
394 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { | 368 template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { |
395 return SkNx_cast<float>(Sk4i::Load(&src)); | 369 return SkNx_cast<float>(Sk4i::Load(&src)); |
396 } | 370 } |
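
These casts wrap the truncating vcvtq conversions. A short usage sketch, assuming the Sk4f/Sk4i aliases that SkNx.h already provides (sknx_cast_usage_sketch and the values are made up):

    static inline void sknx_cast_usage_sketch() {
        Sk4f floats(0.5f, 1.5f, 2.5f, 3.5f);
        Sk4i ints = SkNx_cast<int32_t>(floats); // vcvtq_s32_f32 truncates: {0,1,2,3}
        Sk4f back = SkNx_cast<float>(ints);     // {0.0f, 1.0f, 2.0f, 3.0f}
        (void)back;
    }
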
(...skipping 69 matching lines...)
466 uint16x4x4_t rgba = {{ | 440 uint16x4x4_t rgba = {{ |
467 r.fVec, | 441 r.fVec, |
468 g.fVec, | 442 g.fVec, |
469 b.fVec, | 443 b.fVec, |
470 a.fVec, | 444 a.fVec, |
471 }}; | 445 }}; |
472 vst4_u16((uint16_t*) dst, rgba); | 446 vst4_u16((uint16_t*) dst, rgba); |
473 } | 447 } |
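
vst4_u16 interleaves the four planar registers on the way out, so dst receives r0,g0,b0,a0, r1,g1,b1,a1, and so on. A scalar sketch of the same store (store4_rgba_sketch is an illustrative name, not part of the header):

    // Write 4 pixels of interleaved 16-bit RGBA from four 4-lane planes.
    static inline void store4_rgba_sketch(uint16_t* dst,
                                          const uint16_t r[4], const uint16_t g[4],
                                          const uint16_t b[4], const uint16_t a[4]) {
        for (int i = 0; i < 4; i++) {
            dst[4*i + 0] = r[i];
            dst[4*i + 1] = g[i];
            dst[4*i + 2] = b[i];
            dst[4*i + 3] = a[i];
        }
    }
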
474 | 448 |
475 #endif//SkNx_neon_DEFINED | 449 #endif//SkNx_neon_DEFINED |