OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
11 #define SKNX_IS_FAST | 11 #define SKNX_IS_FAST |
12 | 12 |
13 namespace { // See SkNx.h | |
14 | |
15 // Well, this is absurd. The shifts require compile-time constant arguments. | 13 // Well, this is absurd. The shifts require compile-time constant arguments. |
16 | 14 |
17 #define SHIFT8(op, v, bits) switch(bits) { \ | 15 #define SHIFT8(op, v, bits) switch(bits) { \ |
18 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v
, 3); \ | 16 case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v
, 3); \ |
19 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v
, 6); \ | 17 case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v
, 6); \ |
20 case 7: return op(v, 7); \ | 18 case 7: return op(v, 7); \ |
21 } return fVec | 19 } return fVec |
22 | 20 |
23 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits)
{ \ | 21 #define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits)
{ \ |
24 case 8: return op(v, 8); case 9: return op(v
, 9); \ | 22 case 8: return op(v, 8); case 9: return op(v
, 9); \ |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
91 } | 89 } |
92 | 90 |
93 SkNx sqrt() const { | 91 SkNx sqrt() const { |
94 #if defined(SK_CPU_ARM64) | 92 #if defined(SK_CPU_ARM64) |
95 return vsqrt_f32(fVec); | 93 return vsqrt_f32(fVec); |
96 #else | 94 #else |
97 return *this * this->rsqrt2(); | 95 return *this * this->rsqrt2(); |
98 #endif | 96 #endif |
99 } | 97 } |
100 | 98 |
101 template <int k> float kth() const { | 99 float operator[](int k) const { |
102 SkASSERT(0 <= k && k < 2); | 100 SkASSERT(0 <= k && k < 2); |
103 return vget_lane_f32(fVec, k&1); | 101 union { float32x2_t v; float fs[2]; } pun = {fVec}; |
| 102 return pun.fs[k&1]; |
104 } | 103 } |
| 104 template <int k> float kth() const { return (*this)[k]; } |
105 | 105 |
106 bool allTrue() const { | 106 bool allTrue() const { |
107 auto v = vreinterpret_u32_f32(fVec); | 107 auto v = vreinterpret_u32_f32(fVec); |
108 return vget_lane_u32(v,0) && vget_lane_u32(v,1); | 108 return vget_lane_u32(v,0) && vget_lane_u32(v,1); |
109 } | 109 } |
110 bool anyTrue() const { | 110 bool anyTrue() const { |
111 auto v = vreinterpret_u32_f32(fVec); | 111 auto v = vreinterpret_u32_f32(fVec); |
112 return vget_lane_u32(v,0) || vget_lane_u32(v,1); | 112 return vget_lane_u32(v,0) || vget_lane_u32(v,1); |
113 } | 113 } |
114 | 114 |
115 float32x2_t fVec; | 115 float32x2_t fVec; |
116 }; | 116 }; |
117 | 117 |
118 template <> | 118 template <> |
119 class SkNx<4, int> { | |
120 public: | |
121 SkNx(const int32x4_t& vec) : fVec(vec) {} | |
122 | |
123 SkNx() {} | |
124 SkNx(int val) : fVec(vdupq_n_s32(val)) {} | |
125 static SkNx Load(const void* ptr) { return vld1q_s32((const int*)ptr); } | |
126 SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } | |
127 | |
128 void store(void* ptr) const { vst1q_s32((int*)ptr, fVec); } | |
129 | |
130 SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } | |
131 SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } | |
132 SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } | |
133 | |
134 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | |
135 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | |
136 | |
137 template <int k> int kth() const { | |
138 SkASSERT(0 <= k && k < 4); | |
139 return vgetq_lane_s32(fVec, k&3); | |
140 } | |
141 | |
142 int32x4_t fVec; | |
143 }; | |
144 | |
145 template <> | |
146 class SkNx<4, float> { | 119 class SkNx<4, float> { |
147 public: | 120 public: |
148 SkNx(float32x4_t vec) : fVec(vec) {} | 121 SkNx(float32x4_t vec) : fVec(vec) {} |
149 | 122 |
150 SkNx() {} | 123 SkNx() {} |
151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} | 124 SkNx(float val) : fVec(vdupq_n_f32(val)) {} |
152 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } | 125 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } |
153 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } | 126 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } |
154 | 127 |
155 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } | 128 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
200 } | 173 } |
201 | 174 |
202 SkNx sqrt() const { | 175 SkNx sqrt() const { |
203 #if defined(SK_CPU_ARM64) | 176 #if defined(SK_CPU_ARM64) |
204 return vsqrtq_f32(fVec); | 177 return vsqrtq_f32(fVec); |
205 #else | 178 #else |
206 return *this * this->rsqrt2(); | 179 return *this * this->rsqrt2(); |
207 #endif | 180 #endif |
208 } | 181 } |
209 | 182 |
210 template <int k> float kth() const { | 183 float operator[](int k) const { |
211 SkASSERT(0 <= k && k < 4); | 184 SkASSERT(0 <= k && k < 4); |
212 return vgetq_lane_f32(fVec, k&3); | 185 union { float32x4_t v; float fs[4]; } pun = {fVec}; |
| 186 return pun.fs[k&3]; |
213 } | 187 } |
| 188 template <int k> float kth() const { return (*this)[k]; } |
214 | 189 |
215 bool allTrue() const { | 190 bool allTrue() const { |
216 auto v = vreinterpretq_u32_f32(fVec); | 191 auto v = vreinterpretq_u32_f32(fVec); |
217 return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1) | 192 return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1) |
218 && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3); | 193 && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3); |
219 } | 194 } |
220 bool anyTrue() const { | 195 bool anyTrue() const { |
221 auto v = vreinterpretq_u32_f32(fVec); | 196 auto v = vreinterpretq_u32_f32(fVec); |
222 return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1) | 197 return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1) |
223 || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3); | 198 || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3); |
(...skipping 26 matching lines...) Expand all Loading... |
250 | 225 |
251 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } | 226 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } |
252 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } | 227 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } |
253 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } | 228 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } |
254 | 229 |
255 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } | 230 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } |
256 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } | 231 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } |
257 | 232 |
258 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
ec); } | 233 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
ec); } |
259 | 234 |
260 template <int k> uint16_t kth() const { | 235 uint16_t operator[](int k) const { |
261 SkASSERT(0 <= k && k < 4); | 236 SkASSERT(0 <= k && k < 4); |
262 return vget_lane_u16(fVec, k&3); | 237 union { uint16x4_t v; uint16_t us[4]; } pun = {fVec}; |
| 238 return pun.us[k&3]; |
263 } | 239 } |
| 240 template <int k> uint16_t kth() const { return (*this)[k]; } |
264 | 241 |
265 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 242 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
266 return vbsl_u16(fVec, t.fVec, e.fVec); | 243 return vbsl_u16(fVec, t.fVec, e.fVec); |
267 } | 244 } |
268 | 245 |
269 uint16x4_t fVec; | 246 uint16x4_t fVec; |
270 }; | 247 }; |
271 | 248 |
272 template <> | 249 template <> |
273 class SkNx<8, uint16_t> { | 250 class SkNx<8, uint16_t> { |
(...skipping 13 matching lines...) Expand all Loading... |
287 | 264 |
288 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } | 265 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } |
289 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } | 266 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } |
290 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } | 267 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } |
291 | 268 |
292 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } | 269 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } |
293 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } | 270 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } |
294 | 271 |
295 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
Vec); } | 272 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
Vec); } |
296 | 273 |
297 template <int k> uint16_t kth() const { | 274 uint16_t operator[](int k) const { |
298 SkASSERT(0 <= k && k < 8); | 275 SkASSERT(0 <= k && k < 8); |
299 return vgetq_lane_u16(fVec, k&7); | 276 union { uint16x8_t v; uint16_t us[8]; } pun = {fVec}; |
| 277 return pun.us[k&7]; |
300 } | 278 } |
| 279 template <int k> uint16_t kth() const { return (*this)[k]; } |
301 | 280 |
302 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 281 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
303 return vbslq_u16(fVec, t.fVec, e.fVec); | 282 return vbslq_u16(fVec, t.fVec, e.fVec); |
304 } | 283 } |
305 | 284 |
306 uint16x8_t fVec; | 285 uint16x8_t fVec; |
307 }; | 286 }; |
308 | 287 |
309 template <> | 288 template <> |
310 class SkNx<4, uint8_t> { | 289 class SkNx<4, uint8_t> { |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
343 void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } | 322 void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } |
344 | 323 |
345 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } | 324 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } |
346 | 325 |
347 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } | 326 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } |
348 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } | 327 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } |
349 | 328 |
350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fV
ec); } | 329 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fV
ec); } |
351 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } | 330 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } |
352 | 331 |
353 template <int k> uint8_t kth() const { | 332 uint8_t operator[](int k) const { |
354 SkASSERT(0 <= k && k < 15); | 333 SkASSERT(0 <= k && k < 16); |
355 return vgetq_lane_u8(fVec, k&16); | 334 union { uint8x16_t v; uint8_t us[16]; } pun = {fVec}; |
| 335 return pun.us[k&15]; |
356 } | 336 } |
| 337 template <int k> uint8_t kth() const { return (*this)[k]; } |
357 | 338 |
358 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 339 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
359 return vbslq_u8(fVec, t.fVec, e.fVec); | 340 return vbslq_u8(fVec, t.fVec, e.fVec); |
360 } | 341 } |
361 | 342 |
362 uint8x16_t fVec; | 343 uint8x16_t fVec; |
363 }; | 344 }; |
364 | 345 |
365 #undef SHIFT32 | 346 #undef SHIFT32 |
366 #undef SHIFT16 | 347 #undef SHIFT16 |
367 #undef SHIFT8 | 348 #undef SHIFT8 |
368 | 349 |
369 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { | 350 template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
370 return vcvtq_s32_f32(src.fVec); | |
371 } | |
372 | |
373 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { | |
374 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); | 351 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); |
375 uint16x4_t _16 = vqmovn_u32(_32); | 352 uint16x4_t _16 = vqmovn_u32(_32); |
376 return vqmovn_u16(vcombine_u16(_16, _16)); | 353 return vqmovn_u16(vcombine_u16(_16, _16)); |
377 } | 354 } |
378 | 355 |
379 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { | 356 template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { |
380 uint16x8_t _16 = vmovl_u8 (src.fVec) ; | 357 uint16x8_t _16 = vmovl_u8 (src.fVec) ; |
381 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); | 358 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); |
382 return vcvtq_f32_u32(_32); | 359 return vcvtq_f32_u32(_32); |
383 } | 360 } |
384 | 361 |
385 static inline void Sk4f_ToBytes(uint8_t bytes[16], | 362 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
386 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { | 363 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { |
387 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), | 364 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), |
388 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], | 365 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], |
389 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), | 366 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), |
390 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0])
.val[0]); | 367 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0])
.val[0]); |
391 } | 368 } |
392 | 369 |
393 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) { | 370 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { |
394 return vget_low_u16(vmovl_u8(src.fVec)); | 371 return vget_low_u16(vmovl_u8(src.fVec)); |
395 } | 372 } |
396 | 373 |
397 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { | 374 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
398 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); | 375 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); |
399 } | 376 } |
400 | 377 |
401 } // namespace | |
402 | |
403 #endif//SkNx_neon_DEFINED | 378 #endif//SkNx_neon_DEFINED |
OLD | NEW |