Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(161)

Side by Side Diff: src/opts/SkNx_neon.h

Issue 2133413002: try to speed-up maprect + round2i + contains (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: add dox Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkNx_neon_DEFINED
9 #define SkNx_neon_DEFINED
10
11 #include <arm_neon.h>
12
13 #define SKNX_IS_FAST
14
15 // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it:
16 // - roundtrip through integers via truncation
17 // - subtract 1 if that's too big (possible for negative values).
18 // This restricts the domain of our inputs to a maximum somehwere around 2^31. Seems plenty big.
19 static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) {
20 auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
21 auto too_big = vcgtq_f32(roundtrip, v);
22 return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdup q_n_f32(1)));
23 }
24
25 // Well, this is absurd. The shifts require compile-time constant arguments.
26
27 #define SHIFT8(op, v, bits) switch(bits) { \
28 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v , 3); \
29 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v , 6); \
30 case 7: return op(v, 7); \
31 } return fVec
32
33 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \
34 case 8: return op(v, 8); case 9: return op(v , 9); \
35 case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v , 12); \
36 case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v , 15); \
37 } return fVec
38
39 #define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bit s) { \
40 case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v , 18); \
41 case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v , 21); \
42 case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v , 24); \
43 case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v , 27); \
44 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v , 30); \
45 case 31: return op(v, 31); } return fVec
46
47 template <>
48 class SkNx<2, float> {
49 public:
50 SkNx(float32x2_t vec) : fVec(vec) {}
51
52 SkNx() {}
53 SkNx(float val) : fVec(vdup_n_f32(val)) {}
54 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); }
55 SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }
56
57 void store(void* ptr) const { vst1_f32((float*)ptr, fVec); }
58
59 SkNx invert() const {
60 float32x2_t est0 = vrecpe_f32(fVec),
61 est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
62 return est1;
63 }
64
65 SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
66 SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
67 SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
68 SkNx operator / (const SkNx& o) const {
69 #if defined(SK_CPU_ARM64)
70 return vdiv_f32(fVec, o.fVec);
71 #else
72 float32x2_t est0 = vrecpe_f32(o.fVec),
73 est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),
74 est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);
75 return vmul_f32(fVec, est2);
76 #endif
77 }
78
79 SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f3 2(fVec, o.fVec)); }
80 SkNx operator < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f3 2(fVec, o.fVec)); }
81 SkNx operator > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f3 2(fVec, o.fVec)); }
82 SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f3 2(fVec, o.fVec)); }
83 SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f3 2(fVec, o.fVec)); }
84 SkNx operator != (const SkNx& o) const {
85 return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
86 }
87
88 static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fV ec); }
89 static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fV ec); }
90
91 SkNx rsqrt() const {
92 float32x2_t est0 = vrsqrte_f32(fVec);
93 return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
94 }
95
96 SkNx sqrt() const {
97 #if defined(SK_CPU_ARM64)
98 return vsqrt_f32(fVec);
99 #else
100 float32x2_t est0 = vrsqrte_f32(fVec),
101 est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est 0),
102 est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est 1);
103 return vmul_f32(fVec, est2);
104 #endif
105 }
106
107 float operator[](int k) const {
108 SkASSERT(0 <= k && k < 2);
109 union { float32x2_t v; float fs[2]; } pun = {fVec};
110 return pun.fs[k&1];
111 }
112
113 bool allTrue() const {
114 auto v = vreinterpret_u32_f32(fVec);
115 return vget_lane_u32(v,0) && vget_lane_u32(v,1);
116 }
117 bool anyTrue() const {
118 auto v = vreinterpret_u32_f32(fVec);
119 return vget_lane_u32(v,0) || vget_lane_u32(v,1);
120 }
121
122 float32x2_t fVec;
123 };
124
125 template <>
126 class SkNx<4, float> {
127 public:
128 SkNx(float32x4_t vec) : fVec(vec) {}
129
130 SkNx() {}
131 SkNx(float val) : fVec(vdupq_n_f32(val)) {}
132 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
133 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
134
135 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
136 SkNx invert() const {
137 float32x4_t est0 = vrecpeq_f32(fVec),
138 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
139 return est1;
140 }
141
142 SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
143 SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
144 SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
145 SkNx operator / (const SkNx& o) const {
146 #if defined(SK_CPU_ARM64)
147 return vdivq_f32(fVec, o.fVec);
148 #else
149 float32x4_t est0 = vrecpeq_f32(o.fVec),
150 est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
151 est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
152 return vmulq_f32(fVec, est2);
153 #endif
154 }
155
156 SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f3 2(fVec, o.fVec)); }
157 SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f3 2(fVec, o.fVec)); }
158 SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f3 2(fVec, o.fVec)); }
159 SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f3 2(fVec, o.fVec)); }
160 SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f3 2(fVec, o.fVec)); }
161 SkNx operator!=(const SkNx& o) const {
162 return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
163 }
164
165 static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.f Vec); }
166 static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.f Vec); }
167
168 SkNx abs() const { return vabsq_f32(fVec); }
169 SkNx floor() const {
170 #if defined(SK_CPU_ARM64)
171 return vrndmq_f32(fVec);
172 #else
173 return armv7_vrndmq_f32(fVec);
174 #endif
175 }
176
177
178 SkNx rsqrt() const {
179 float32x4_t est0 = vrsqrteq_f32(fVec);
180 return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
181 }
182
183 SkNx sqrt() const {
184 #if defined(SK_CPU_ARM64)
185 return vsqrtq_f32(fVec);
186 #else
187 float32x4_t est0 = vrsqrteq_f32(fVec),
188 est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0),
189 est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
190 return vmulq_f32(fVec, est2);
191 #endif
192 }
193
194 float operator[](int k) const {
195 SkASSERT(0 <= k && k < 4);
196 union { float32x4_t v; float fs[4]; } pun = {fVec};
197 return pun.fs[k&3];
198 }
199
200 bool allTrue() const {
201 auto v = vreinterpretq_u32_f32(fVec);
202 return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
203 && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
204 }
205 bool anyTrue() const {
206 auto v = vreinterpretq_u32_f32(fVec);
207 return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
208 || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
209 }
210
211 SkNx thenElse(const SkNx& t, const SkNx& e) const {
212 return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
213 }
214
215 float32x4_t fVec;
216 };
217
218 // It's possible that for our current use cases, representing this as
219 // half a uint16x8_t might be better than representing it as a uint16x4_t.
220 // It'd make conversion to Sk4b one step simpler.
221 template <>
222 class SkNx<4, uint16_t> {
223 public:
224 SkNx(const uint16x4_t& vec) : fVec(vec) {}
225
226 SkNx() {}
227 SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
228 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
229
230 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
231 fVec = (uint16x4_t) { a,b,c,d };
232 }
233
234 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }
235
236 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
237 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
238 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
239
240 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); }
241 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); }
242
243 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV ec); }
244
245 uint16_t operator[](int k) const {
246 SkASSERT(0 <= k && k < 4);
247 union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
248 return pun.us[k&3];
249 }
250
251 SkNx thenElse(const SkNx& t, const SkNx& e) const {
252 return vbsl_u16(fVec, t.fVec, e.fVec);
253 }
254
255 uint16x4_t fVec;
256 };
257
258 template <>
259 class SkNx<8, uint16_t> {
260 public:
261 SkNx(const uint16x8_t& vec) : fVec(vec) {}
262
263 SkNx() {}
264 SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
265 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); }
266
267 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
268 uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
269 fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
270 }
271
272 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); }
273
274 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
275 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
276 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
277
278 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); }
279 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); }
280
281 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f Vec); }
282
283 uint16_t operator[](int k) const {
284 SkASSERT(0 <= k && k < 8);
285 union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
286 return pun.us[k&7];
287 }
288
289 SkNx thenElse(const SkNx& t, const SkNx& e) const {
290 return vbslq_u16(fVec, t.fVec, e.fVec);
291 }
292
293 uint16x8_t fVec;
294 };
295
296 template <>
297 class SkNx<4, uint8_t> {
298 public:
299 SkNx(const uint8x8_t& vec) : fVec(vec) {}
300
301 SkNx() {}
302 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
303 fVec = (uint8x8_t){a,b,c,d, 0,0,0,0};
304 }
305 static SkNx Load(const void* ptr) {
306 return (uint8x8_t)vld1_dup_u32((const uint32_t*)ptr);
307 }
308 void store(void* ptr) const {
309 return vst1_lane_u32((uint32_t*)ptr, (uint32x2_t)fVec, 0);
310 }
311 uint8_t operator[](int k) const {
312 SkASSERT(0 <= k && k < 4);
313 union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
314 return pun.us[k&3];
315 }
316
317 // TODO as needed
318
319 uint8x8_t fVec;
320 };
321
322 template <>
323 class SkNx<16, uint8_t> {
324 public:
325 SkNx(const uint8x16_t& vec) : fVec(vec) {}
326
327 SkNx() {}
328 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
329 static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); }
330
331 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
332 uint8_t e, uint8_t f, uint8_t g, uint8_t h,
333 uint8_t i, uint8_t j, uint8_t k, uint8_t l,
334 uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
335 fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
336 }
337
338 void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); }
339
340 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }
341
342 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
343 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
344
345 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fV ec); }
346 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }
347
348 uint8_t operator[](int k) const {
349 SkASSERT(0 <= k && k < 16);
350 union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
351 return pun.us[k&15];
352 }
353
354 SkNx thenElse(const SkNx& t, const SkNx& e) const {
355 return vbslq_u8(fVec, t.fVec, e.fVec);
356 }
357
358 uint8x16_t fVec;
359 };
360
361 template <>
362 class SkNx<4, int> {
363 public:
364 SkNx(const int32x4_t& vec) : fVec(vec) {}
365
366 SkNx() {}
367 SkNx(int v) {
368 fVec = vdupq_n_s32(v);
369 }
370 SkNx(int a, int b, int c, int d) {
371 fVec = (int32x4_t){a,b,c,d};
372 }
373 static SkNx Load(const void* ptr) {
374 return vld1q_s32((const int32_t*)ptr);
375 }
376 void store(void* ptr) const {
377 return vst1q_s32((int32_t*)ptr, fVec);
378 }
379 int operator[](int k) const {
380 SkASSERT(0 <= k && k < 4);
381 union { int32x4_t v; int is[4]; } pun = {fVec};
382 return pun.is[k&3];
383 }
384
385 SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
386 SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
387 SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
388
389 SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
390
391 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
392 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
393
394 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.f Vec); }
395 // TODO as needed
396
397 int32x4_t fVec;
398 };
399
400 #undef SHIFT32
401 #undef SHIFT16
402 #undef SHIFT8
403
404 template<> inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
405 return vcvtq_s32_f32(src.fVec);
406
407 }
408 template<> inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
409 return vcvtq_f32_s32(src.fVec);
410 }
411
412 template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
413 return vqmovn_u32(vcvtq_u32_f32(src.fVec));
414 }
415
416 template<> inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
417 return vcvtq_f32_u32(vmovl_u16(src.fVec));
418 }
419
420 template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
421 uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
422 uint16x4_t _16 = vqmovn_u32(_32);
423 return vqmovn_u16(vcombine_u16(_16, _16));
424 }
425
426 template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
427 uint16x8_t _16 = vmovl_u8 (src.fVec) ;
428 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16));
429 return vcvtq_f32_u32(_32);
430 }
431
432 template<> inline Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
433 Sk8f ab, cd;
434 SkNx_split(src, &ab, &cd);
435
436 Sk4f a,b,c,d;
437 SkNx_split(ab, &a, &b);
438 SkNx_split(cd, &c, &d);
439 return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
440 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
441 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
442 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0];
443 }
444
445 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
446 return vget_low_u16(vmovl_u8(src.fVec));
447 }
448
449 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
450 return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
451 }
452
453 #endif//SkNx_neon_DEFINED
OLDNEW
« include/private/SkNx_sse.h ('K') | « src/core/SkNx.h ('k') | src/opts/SkNx_sse.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698