OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
47 public: | 47 public: |
48 SkNx(float32x2_t vec) : fVec(vec) {} | 48 SkNx(float32x2_t vec) : fVec(vec) {} |
49 | 49 |
50 SkNx() {} | 50 SkNx() {} |
51 SkNx(float val) : fVec(vdup_n_f32(val)) {} | 51 SkNx(float val) : fVec(vdup_n_f32(val)) {} |
52 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } | 52 static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } |
53 SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; } | 53 SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; } |
54 | 54 |
55 void store(void* ptr) const { vst1_f32((float*)ptr, fVec); } | 55 void store(void* ptr) const { vst1_f32((float*)ptr, fVec); } |
56 | 56 |
57 SkNx approxInvert() const { | 57 SkNx invert() const { |
58 float32x2_t est0 = vrecpe_f32(fVec), | 58 float32x2_t est0 = vrecpe_f32(fVec), |
59 est1 = vmul_f32(vrecps_f32(est0, fVec), est0); | 59 est1 = vmul_f32(vrecps_f32(est0, fVec), est0); |
60 return est1; | 60 return est1; |
61 } | 61 } |
62 SkNx invert() const { | |
63 float32x2_t est1 = this->approxInvert().fVec, | |
64 est2 = vmul_f32(vrecps_f32(est1, fVec), est1); | |
65 return est2; | |
66 } | |
67 | 62 |
68 SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); } | 63 SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); } |
69 SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); } | 64 SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); } |
70 SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); } | 65 SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); } |
71 SkNx operator / (const SkNx& o) const { | 66 SkNx operator / (const SkNx& o) const { |
72 #if defined(SK_CPU_ARM64) | 67 #if defined(SK_CPU_ARM64) |
73 return vdiv_f32(fVec, o.fVec); | 68 return vdiv_f32(fVec, o.fVec); |
74 #else | 69 #else |
75 return vmul_f32(fVec, o.invert().fVec); | 70 float32x2_t est0 = vrecpe_f32(o.fVec), |
| 71 est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0), |
| 72 est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1); |
| 73 return vmul_f32(fVec, est2); |
76 #endif | 74 #endif |
77 } | 75 } |
78 | 76 |
79 SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f3
2(fVec, o.fVec)); } | 77 SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f3
2(fVec, o.fVec)); } |
80 SkNx operator < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f3
2(fVec, o.fVec)); } | 78 SkNx operator < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f3
2(fVec, o.fVec)); } |
81 SkNx operator > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f3
2(fVec, o.fVec)); } | 79 SkNx operator > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f3
2(fVec, o.fVec)); } |
82 SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f3
2(fVec, o.fVec)); } | 80 SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f3
2(fVec, o.fVec)); } |
83 SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f3
2(fVec, o.fVec)); } | 81 SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f3
2(fVec, o.fVec)); } |
84 SkNx operator != (const SkNx& o) const { | 82 SkNx operator != (const SkNx& o) const { |
85 return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec))); | 83 return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec))); |
86 } | 84 } |
87 | 85 |
88 static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fV
ec); } | 86 static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fV
ec); } |
89 static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fV
ec); } | 87 static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fV
ec); } |
90 | 88 |
91 SkNx rsqrt0() const { return vrsqrte_f32(fVec); } | 89 SkNx rsqrt() const { |
92 SkNx rsqrt1() const { | 90 float32x2_t est0 = vrsqrte_f32(fVec); |
93 float32x2_t est0 = this->rsqrt0().fVec; | |
94 return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0); | 91 return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0); |
95 } | 92 } |
96 SkNx rsqrt2() const { | |
97 float32x2_t est1 = this->rsqrt1().fVec; | |
98 return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1); | |
99 } | |
100 | 93 |
101 SkNx sqrt() const { | 94 SkNx sqrt() const { |
102 #if defined(SK_CPU_ARM64) | 95 #if defined(SK_CPU_ARM64) |
103 return vsqrt_f32(fVec); | 96 return vsqrt_f32(fVec); |
104 #else | 97 #else |
105 return *this * this->rsqrt2(); | 98 float32x2_t est0 = vrsqrte_f32(fVec), |
| 99 est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est
0), |
| 100 est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est
1); |
| 101 return vmul_f32(fVec, est2); |
106 #endif | 102 #endif |
107 } | 103 } |
108 | 104 |
109 float operator[](int k) const { | 105 float operator[](int k) const { |
110 SkASSERT(0 <= k && k < 2); | 106 SkASSERT(0 <= k && k < 2); |
111 union { float32x2_t v; float fs[2]; } pun = {fVec}; | 107 union { float32x2_t v; float fs[2]; } pun = {fVec}; |
112 return pun.fs[k&1]; | 108 return pun.fs[k&1]; |
113 } | 109 } |
114 | 110 |
115 bool allTrue() const { | 111 bool allTrue() const { |
(...skipping 12 matching lines...) Expand all Loading... |
128 class SkNx<4, float> { | 124 class SkNx<4, float> { |
129 public: | 125 public: |
130 SkNx(float32x4_t vec) : fVec(vec) {} | 126 SkNx(float32x4_t vec) : fVec(vec) {} |
131 | 127 |
132 SkNx() {} | 128 SkNx() {} |
133 SkNx(float val) : fVec(vdupq_n_f32(val)) {} | 129 SkNx(float val) : fVec(vdupq_n_f32(val)) {} |
134 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } | 130 static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } |
135 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } | 131 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } |
136 | 132 |
137 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } | 133 void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } |
138 SkNx approxInvert() const { | 134 SkNx invert() const { |
139 float32x4_t est0 = vrecpeq_f32(fVec), | 135 float32x4_t est0 = vrecpeq_f32(fVec), |
140 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); | 136 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); |
141 return est1; | 137 return est1; |
142 } | 138 } |
143 SkNx invert() const { | |
144 float32x4_t est1 = this->approxInvert().fVec, | |
145 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); | |
146 return est2; | |
147 } | |
148 | 139 |
149 SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); } | 140 SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); } |
150 SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); } | 141 SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); } |
151 SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); } | 142 SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); } |
152 SkNx operator / (const SkNx& o) const { | 143 SkNx operator / (const SkNx& o) const { |
153 #if defined(SK_CPU_ARM64) | 144 #if defined(SK_CPU_ARM64) |
154 return vdivq_f32(fVec, o.fVec); | 145 return vdivq_f32(fVec, o.fVec); |
155 #else | 146 #else |
156 return vmulq_f32(fVec, o.invert().fVec); | 147 float32x4_t est0 = vrecpeq_f32(o.fVec), |
| 148 est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0), |
| 149 est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1); |
| 150 return vmulq_f32(fVec, est2); |
157 #endif | 151 #endif |
158 } | 152 } |
159 | 153 |
160 SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f3
2(fVec, o.fVec)); } | 154 SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f3
2(fVec, o.fVec)); } |
161 SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f3
2(fVec, o.fVec)); } | 155 SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f3
2(fVec, o.fVec)); } |
162 SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f3
2(fVec, o.fVec)); } | 156 SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f3
2(fVec, o.fVec)); } |
163 SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f3
2(fVec, o.fVec)); } | 157 SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f3
2(fVec, o.fVec)); } |
164 SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f3
2(fVec, o.fVec)); } | 158 SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f3
2(fVec, o.fVec)); } |
165 SkNx operator!=(const SkNx& o) const { | 159 SkNx operator!=(const SkNx& o) const { |
166 return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); | 160 return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); |
167 } | 161 } |
168 | 162 |
169 static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.f
Vec); } | 163 static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.f
Vec); } |
170 static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.f
Vec); } | 164 static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.f
Vec); } |
171 | 165 |
172 SkNx abs() const { return vabsq_f32(fVec); } | 166 SkNx abs() const { return vabsq_f32(fVec); } |
173 SkNx floor() const { | 167 SkNx floor() const { |
174 #if defined(SK_CPU_ARM64) | 168 #if defined(SK_CPU_ARM64) |
175 return vrndmq_f32(fVec); | 169 return vrndmq_f32(fVec); |
176 #else | 170 #else |
177 return armv7_vrndmq_f32(fVec); | 171 return armv7_vrndmq_f32(fVec); |
178 #endif | 172 #endif |
179 } | 173 } |
180 | 174 |
181 | 175 |
182 SkNx rsqrt0() const { return vrsqrteq_f32(fVec); } | 176 SkNx rsqrt() const { |
183 SkNx rsqrt1() const { | 177 float32x4_t est0 = vrsqrteq_f32(fVec); |
184 float32x4_t est0 = this->rsqrt0().fVec; | |
185 return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0); | 178 return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0); |
186 } | 179 } |
187 SkNx rsqrt2() const { | |
188 float32x4_t est1 = this->rsqrt1().fVec; | |
189 return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1); | |
190 } | |
191 | 180 |
192 SkNx sqrt() const { | 181 SkNx sqrt() const { |
193 #if defined(SK_CPU_ARM64) | 182 #if defined(SK_CPU_ARM64) |
194 return vsqrtq_f32(fVec); | 183 return vsqrtq_f32(fVec); |
195 #else | 184 #else |
196 return *this * this->rsqrt2(); | 185 float32x4_t est0 = vrsqrteq_f32(fVec), |
| 186 est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)),
est0), |
| 187 est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)),
est1); |
| 188 return vmulq_f32(fVec, est2); |
197 #endif | 189 #endif |
198 } | 190 } |
199 | 191 |
200 float operator[](int k) const { | 192 float operator[](int k) const { |
201 SkASSERT(0 <= k && k < 4); | 193 SkASSERT(0 <= k && k < 4); |
202 union { float32x4_t v; float fs[4]; } pun = {fVec}; | 194 union { float32x4_t v; float fs[4]; } pun = {fVec}; |
203 return pun.fs[k&3]; | 195 return pun.fs[k&3]; |
204 } | 196 } |
205 | 197 |
206 bool allTrue() const { | 198 bool allTrue() const { |
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
357 return pun.us[k&15]; | 349 return pun.us[k&15]; |
358 } | 350 } |
359 | 351 |
360 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 352 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
361 return vbslq_u8(fVec, t.fVec, e.fVec); | 353 return vbslq_u8(fVec, t.fVec, e.fVec); |
362 } | 354 } |
363 | 355 |
364 uint8x16_t fVec; | 356 uint8x16_t fVec; |
365 }; | 357 }; |
366 | 358 |
| 359 template <> |
| 360 class SkNx<4, int> { |
| 361 public: |
| 362 SkNx(const int32x4_t& vec) : fVec(vec) {} |
| 363 |
| 364 SkNx() {} |
| 365 SkNx(int v) { |
| 366 fVec = vdupq_n_s32(v); |
| 367 } |
| 368 SkNx(int a, int b, int c, int d) { |
| 369 fVec = (int32x4_t){a,b,c,d}; |
| 370 } |
| 371 static SkNx Load(const void* ptr) { |
| 372 return vld1q_s32((const int32_t*)ptr); |
| 373 } |
| 374 void store(void* ptr) const { |
| 375 return vst1q_s32((int32_t*)ptr, fVec); |
| 376 } |
| 377 int operator[](int k) const { |
| 378 SkASSERT(0 <= k && k < 4); |
| 379 union { int32x4_t v; int is[4]; } pun = {fVec}; |
| 380 return pun.is[k&3]; |
| 381 } |
| 382 |
| 383 SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } |
| 384 SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } |
| 385 SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } |
| 386 |
| 387 SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } |
| 388 SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } |
| 389 |
| 390 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.f
Vec); } |
| 391 // TODO as needed |
| 392 |
| 393 int32x4_t fVec; |
| 394 }; |
| 395 |
367 #undef SHIFT32 | 396 #undef SHIFT32 |
368 #undef SHIFT16 | 397 #undef SHIFT16 |
369 #undef SHIFT8 | 398 #undef SHIFT8 |
370 | 399 |
| 400 template<> inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { |
| 401 return vcvtq_s32_f32(src.fVec); |
| 402 |
| 403 } |
| 404 template<> inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { |
| 405 return vcvtq_f32_s32(src.fVec); |
| 406 } |
| 407 |
371 template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { | 408 template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { |
372 return vqmovn_u32(vcvtq_u32_f32(src.fVec)); | 409 return vqmovn_u32(vcvtq_u32_f32(src.fVec)); |
373 } | 410 } |
374 | 411 |
375 template<> inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { | 412 template<> inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { |
376 return vcvtq_f32_u32(vmovl_u16(src.fVec)); | 413 return vcvtq_f32_u32(vmovl_u16(src.fVec)); |
377 } | 414 } |
378 | 415 |
379 template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { | 416 template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { |
380 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); | 417 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); |
381 uint16x4_t _16 = vqmovn_u32(_32); | 418 uint16x4_t _16 = vqmovn_u32(_32); |
382 return vqmovn_u16(vcombine_u16(_16, _16)); | 419 return vqmovn_u16(vcombine_u16(_16, _16)); |
383 } | 420 } |
384 | 421 |
385 template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { | 422 template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { |
386 uint16x8_t _16 = vmovl_u8 (src.fVec) ; | 423 uint16x8_t _16 = vmovl_u8 (src.fVec) ; |
387 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); | 424 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); |
388 return vcvtq_f32_u32(_32); | 425 return vcvtq_f32_u32(_32); |
389 } | 426 } |
390 | 427 |
391 static inline void Sk4f_ToBytes(uint8_t bytes[16], | 428 template<> inline Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) { |
392 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { | 429 Sk8f ab, cd; |
393 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), | 430 SkNx_split(src, &ab, &cd); |
394 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], | 431 |
395 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), | 432 Sk4f a,b,c,d; |
396 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0])
.val[0]); | 433 SkNx_split(ab, &a, &b); |
| 434 SkNx_split(cd, &c, &d); |
| 435 return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), |
| 436 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], |
| 437 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), |
| 438 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]; |
397 } | 439 } |
398 | 440 |
399 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { | 441 template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { |
400 return vget_low_u16(vmovl_u8(src.fVec)); | 442 return vget_low_u16(vmovl_u8(src.fVec)); |
401 } | 443 } |
402 | 444 |
403 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { | 445 template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { |
404 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); | 446 return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); |
405 } | 447 } |
406 | 448 |
407 #endif//SkNx_neon_DEFINED | 449 #endif//SkNx_neon_DEFINED |
OLD | NEW |