OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
(...skipping 23 matching lines...) Expand all Loading... |
34 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v
, 30); \ | 34 case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v
, 30); \ |
35 case 31: return op(v, 31); } return fVec | 35 case 31: return op(v, 31); } return fVec |
36 | 36 |
37 template <> | 37 template <> |
38 class SkNx<2, float> { | 38 class SkNx<2, float> { |
39 public: | 39 public: |
40 SkNx(float32x2_t vec) : fVec(vec) {} | 40 SkNx(float32x2_t vec) : fVec(vec) {} |
41 | 41 |
42 SkNx() {} | 42 SkNx() {} |
43 SkNx(float val) : fVec(vdup_n_f32(val)) {} | 43 SkNx(float val) : fVec(vdup_n_f32(val)) {} |
44 static SkNx Load(const float vals[2]) { return vld1_f32(vals); } | 44 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } |
45 SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; } | 45 SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; } |
46 | 46 |
47 void store(float vals[2]) const { vst1_f32(vals, fVec); } | 47 void store(void* ptr) const { vst1_f32((float*)ptr, fVec); } |
48 | 48 |
49 SkNx approxInvert() const { | 49 SkNx approxInvert() const { |
50 float32x2_t est0 = vrecpe_f32(fVec), | 50 float32x2_t est0 = vrecpe_f32(fVec), |
51 est1 = vmul_f32(vrecps_f32(est0, fVec), est0); | 51 est1 = vmul_f32(vrecps_f32(est0, fVec), est0); |
52 return est1; | 52 return est1; |
53 } | 53 } |
54 SkNx invert() const { | 54 SkNx invert() const { |
55 float32x2_t est1 = this->approxInvert().fVec, | 55 float32x2_t est1 = this->approxInvert().fVec, |
56 est2 = vmul_f32(vrecps_f32(est1, fVec), est1); | 56 est2 = vmul_f32(vrecps_f32(est1, fVec), est1); |
57 return est2; | 57 return est2; |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
115 float32x2_t fVec; | 115 float32x2_t fVec; |
116 }; | 116 }; |
117 | 117 |
118 template <> | 118 template <> |
119 class SkNx<4, int> { | 119 class SkNx<4, int> { |
120 public: | 120 public: |
121 SkNx(const int32x4_t& vec) : fVec(vec) {} | 121 SkNx(const int32x4_t& vec) : fVec(vec) {} |
122 | 122 |
123 SkNx() {} | 123 SkNx() {} |
124 SkNx(int val) : fVec(vdupq_n_s32(val)) {} | 124 SkNx(int val) : fVec(vdupq_n_s32(val)) {} |
125 static SkNx Load(const int vals[4]) { return vld1q_s32(vals); } | 125 static SkNx Load(const void* ptr) { return vld1q_s32((const int*)ptr); } |
126 SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } | 126 SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } |
127 | 127 |
128 void store(int vals[4]) const { vst1q_s32(vals, fVec); } | 128 void store(void* ptr) const { vst1q_s32((int*)ptr, fVec); } |
129 | 129 |
130 SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } | 130 SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } |
131 SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } | 131 SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } |
132 SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } | 132 SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } |
133 | 133 |
134 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } | 134 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } |
135 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } | 135 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } |
136 | 136 |
137 template <int k> int kth() const { | 137 template <int k> int kth() const { |
138 SkASSERT(0 <= k && k < 4); | 138 SkASSERT(0 <= k && k < 4); |
139 return vgetq_lane_s32(fVec, k&3); | 139 return vgetq_lane_s32(fVec, k&3); |
140 } | 140 } |
141 | 141 |
142 int32x4_t fVec; | 142 int32x4_t fVec; |
143 }; | 143 }; |
144 | 144 |
145 template <> | 145 template <> |
146 class SkNx<4, float> { | 146 class SkNx<4, float> { |
147 public: | 147 public: |
148 SkNx(float32x4_t vec) : fVec(vec) {} | 148 SkNx(float32x4_t vec) : fVec(vec) {} |
149 | 149 |
150 SkNx() {} | 150 SkNx() {} |
151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} | 151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} |
152 static SkNx Load(const float vals[4]) { return vld1q_f32(vals); } | 152 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } |
153 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } | 153 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } |
154 | 154 |
155 void store(float vals[4]) const { vst1q_f32(vals, fVec); } | 155 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } |
156 SkNx approxInvert() const { | 156 SkNx approxInvert() const { |
157 float32x4_t est0 = vrecpeq_f32(fVec), | 157 float32x4_t est0 = vrecpeq_f32(fVec), |
158 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); | 158 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); |
159 return est1; | 159 return est1; |
160 } | 160 } |
161 SkNx invert() const { | 161 SkNx invert() const { |
162 float32x4_t est1 = this->approxInvert().fVec, | 162 float32x4_t est1 = this->approxInvert().fVec, |
163 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); | 163 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); |
164 return est2; | 164 return est2; |
165 } | 165 } |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
233 // It's possible that for our current use cases, representing this as | 233 // It's possible that for our current use cases, representing this as |
234 // half a uint16x8_t might be better than representing it as a uint16x4_t. | 234 // half a uint16x8_t might be better than representing it as a uint16x4_t. |
235 // It'd make conversion to Sk4b one step simpler. | 235 // It'd make conversion to Sk4b one step simpler. |
236 template <> | 236 template <> |
237 class SkNx<4, uint16_t> { | 237 class SkNx<4, uint16_t> { |
238 public: | 238 public: |
239 SkNx(const uint16x4_t& vec) : fVec(vec) {} | 239 SkNx(const uint16x4_t& vec) : fVec(vec) {} |
240 | 240 |
241 SkNx() {} | 241 SkNx() {} |
242 SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {} | 242 SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {} |
243 static SkNx Load(const uint16_t vals[4]) { return vld1_u16(vals); } | 243 static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } |
244 | 244 |
245 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { | 245 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { |
246 fVec = (uint16x4_t) { a,b,c,d }; | 246 fVec = (uint16x4_t) { a,b,c,d }; |
247 } | 247 } |
248 | 248 |
249 void store(uint16_t vals[4]) const { vst1_u16(vals, fVec); } | 249 void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } |
250 | 250 |
251 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } | 251 SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } |
252 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } | 252 SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } |
253 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } | 253 SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } |
254 | 254 |
255 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } | 255 SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } |
256 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } | 256 SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } |
257 | 257 |
258 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
ec); } | 258 static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fV
ec); } |
259 | 259 |
260 template <int k> uint16_t kth() const { | 260 template <int k> uint16_t kth() const { |
261 SkASSERT(0 <= k && k < 4); | 261 SkASSERT(0 <= k && k < 4); |
262 return vget_lane_u16(fVec, k&3); | 262 return vget_lane_u16(fVec, k&3); |
263 } | 263 } |
264 | 264 |
265 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 265 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
266 return vbsl_u16(fVec, t.fVec, e.fVec); | 266 return vbsl_u16(fVec, t.fVec, e.fVec); |
267 } | 267 } |
268 | 268 |
269 uint16x4_t fVec; | 269 uint16x4_t fVec; |
270 }; | 270 }; |
271 | 271 |
272 template <> | 272 template <> |
273 class SkNx<8, uint16_t> { | 273 class SkNx<8, uint16_t> { |
274 public: | 274 public: |
275 SkNx(const uint16x8_t& vec) : fVec(vec) {} | 275 SkNx(const uint16x8_t& vec) : fVec(vec) {} |
276 | 276 |
277 SkNx() {} | 277 SkNx() {} |
278 SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {} | 278 SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {} |
279 static SkNx Load(const uint16_t vals[8]) { return vld1q_u16(vals); } | 279 static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr);
} |
280 | 280 |
281 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, | 281 SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, |
282 uint16_t e, uint16_t f, uint16_t g, uint16_t h) { | 282 uint16_t e, uint16_t f, uint16_t g, uint16_t h) { |
283 fVec = (uint16x8_t) { a,b,c,d, e,f,g,h }; | 283 fVec = (uint16x8_t) { a,b,c,d, e,f,g,h }; |
284 } | 284 } |
285 | 285 |
286 void store(uint16_t vals[8]) const { vst1q_u16(vals, fVec); } | 286 void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } |
287 | 287 |
288 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } | 288 SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } |
289 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } | 289 SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } |
290 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } | 290 SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } |
291 | 291 |
292 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } | 292 SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } |
293 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } | 293 SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } |
294 | 294 |
295 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
Vec); } | 295 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.f
Vec); } |
296 | 296 |
297 template <int k> uint16_t kth() const { | 297 template <int k> uint16_t kth() const { |
298 SkASSERT(0 <= k && k < 8); | 298 SkASSERT(0 <= k && k < 8); |
299 return vgetq_lane_u16(fVec, k&7); | 299 return vgetq_lane_u16(fVec, k&7); |
300 } | 300 } |
301 | 301 |
302 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 302 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
303 return vbslq_u16(fVec, t.fVec, e.fVec); | 303 return vbslq_u16(fVec, t.fVec, e.fVec); |
304 } | 304 } |
305 | 305 |
306 uint16x8_t fVec; | 306 uint16x8_t fVec; |
307 }; | 307 }; |
308 | 308 |
309 template <> | 309 template <> |
310 class SkNx<4, uint8_t> { | 310 class SkNx<4, uint8_t> { |
311 public: | 311 public: |
312 SkNx(const uint8x8_t& vec) : fVec(vec) {} | 312 SkNx(const uint8x8_t& vec) : fVec(vec) {} |
313 | 313 |
314 SkNx() {} | 314 SkNx() {} |
315 static SkNx Load(const uint8_t vals[4]) { | 315 static SkNx Load(const void* ptr) { |
316 return (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); | 316 return (uint8x8_t)vld1_dup_u32((const uint32_t*)ptr); |
317 } | 317 } |
318 void store(uint8_t vals[4]) const { | 318 void store(void* ptr) const { |
319 return vst1_lane_u32((uint32_t*)vals, (uint32x2_t)fVec, 0); | 319 return vst1_lane_u32((uint32_t*)ptr, (uint32x2_t)fVec, 0); |
320 } | 320 } |
321 | 321 |
322 // TODO as needed | 322 // TODO as needed |
323 | 323 |
324 uint8x8_t fVec; | 324 uint8x8_t fVec; |
325 }; | 325 }; |
326 | 326 |
327 template <> | 327 template <> |
328 class SkNx<16, uint8_t> { | 328 class SkNx<16, uint8_t> { |
329 public: | 329 public: |
330 SkNx(const uint8x16_t& vec) : fVec(vec) {} | 330 SkNx(const uint8x16_t& vec) : fVec(vec) {} |
331 | 331 |
332 SkNx() {} | 332 SkNx() {} |
333 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} | 333 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} |
334 static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); } | 334 static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); } |
335 | 335 |
336 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 336 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
337 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 337 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
338 uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 338 uint8_t i, uint8_t j, uint8_t k, uint8_t l, |
339 uint8_t m, uint8_t n, uint8_t o, uint8_t p) { | 339 uint8_t m, uint8_t n, uint8_t o, uint8_t p) { |
340 fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }; | 340 fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }; |
341 } | 341 } |
342 | 342 |
343 void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); } | 343 void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } |
344 | 344 |
345 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } | 345 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } |
346 | 346 |
347 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } | 347 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } |
348 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } | 348 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } |
349 | 349 |
350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fV
ec); } | 350 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fV
ec); } |
351 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } | 351 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } |
352 | 352 |
353 template <int k> uint8_t kth() const { | 353 template <int k> uint8_t kth() const { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
394 return vget_low_u16(vmovl_u8(src.fVec)); | 394 return vget_low_u16(vmovl_u8(src.fVec)); |
395 } | 395 } |
396 | 396 |
397 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { | 397 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) { |
398 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); | 398 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); |
399 } | 399 } |
400 | 400 |
401 } // namespace | 401 } // namespace |
402 | 402 |
403 #endif//SkNx_neon_DEFINED | 403 #endif//SkNx_neon_DEFINED |
OLD | NEW |