OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
143 }; | 143 }; |
144 | 144 |
145 template <> | 145 template <> |
146 class SkNx<4, float> { | 146 class SkNx<4, float> { |
147 public: | 147 public: |
148 SkNx(float32x4_t vec) : fVec(vec) {} | 148 SkNx(float32x4_t vec) : fVec(vec) {} |
149 | 149 |
150 SkNx() {} | 150 SkNx() {} |
151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} | 151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} |
152 static SkNx Load(const float vals[4]) { return vld1q_f32(vals); } | 152 static SkNx Load(const float vals[4]) { return vld1q_f32(vals); } |
153 static SkNx FromBytes(const uint8_t vals[4]) { | |
154 uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); | |
155 uint16x8_t fix8_16 = vmovl_u8(fix8); | |
156 uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); | |
157 return SkNx(vcvtq_f32_u32(fix8_32)); | |
158 } | |
159 | |
160 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } | 153 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } |
161 | 154 |
162 void store(float vals[4]) const { vst1q_f32(vals, fVec); } | 155 void store(float vals[4]) const { vst1q_f32(vals, fVec); } |
163 void toBytes(uint8_t bytes[4]) const { | |
164 uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); | |
165 uint16x4_t fix8_16 = vqmovn_u32(fix8_32); | |
166 uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); | |
167 vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0); | |
168 } | |
169 | |
170 static void ToBytes(uint8_t bytes[16], | |
171 const SkNx& a, const SkNx& b, const SkNx& c, const SkNx&
d) { | |
172 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), | |
173 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val
[0], | |
174 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), | |
175 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val
[0]).val[0]); | |
176 } | |
177 | |
178 SkNx approxInvert() const { | 156 SkNx approxInvert() const { |
179 float32x4_t est0 = vrecpeq_f32(fVec), | 157 float32x4_t est0 = vrecpeq_f32(fVec), |
180 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); | 158 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); |
181 return est1; | 159 return est1; |
182 } | 160 } |
183 SkNx invert() const { | 161 SkNx invert() const { |
184 float32x4_t est1 = this->approxInvert().fVec, | 162 float32x4_t est1 = this->approxInvert().fVec, |
185 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); | 163 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); |
186 return est2; | 164 return est2; |
187 } | 165 } |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
281 } | 259 } |
282 | 260 |
283 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 261 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
284 return vbslq_u16(fVec, t.fVec, e.fVec); | 262 return vbslq_u16(fVec, t.fVec, e.fVec); |
285 } | 263 } |
286 | 264 |
287 uint16x8_t fVec; | 265 uint16x8_t fVec; |
288 }; | 266 }; |
289 | 267 |
290 template <> | 268 template <> |
| 269 class SkNx<4, uint8_t> { |
| 270 public: |
| 271 SkNx(const uint8x8_t& vec) : fVec(vec) {} |
| 272 |
| 273 SkNx() {} |
| 274 static SkNx Load(const uint8_t vals[4]) { |
| 275 return (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); |
| 276 } |
| 277 void store(uint8_t vals[4]) const { |
| 278 return vst1_lane_u32((uint32_t*)vals, (uint32x2_t)fVec, 0); |
| 279 } |
| 280 |
| 281 // TODO as needed |
| 282 |
| 283 uint8x8_t fVec; |
| 284 }; |
| 285 |
| 286 template <> |
291 class SkNx<16, uint8_t> { | 287 class SkNx<16, uint8_t> { |
292 public: | 288 public: |
293 SkNx(const uint8x16_t& vec) : fVec(vec) {} | 289 SkNx(const uint8x16_t& vec) : fVec(vec) {} |
294 | 290 |
295 SkNx() {} | 291 SkNx() {} |
296 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} | 292 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} |
297 static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); } | 293 static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); } |
298 | 294 |
299 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 295 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
300 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 296 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
(...skipping 21 matching lines...) Expand all Loading... |
322 return vbslq_u8(fVec, t.fVec, e.fVec); | 318 return vbslq_u8(fVec, t.fVec, e.fVec); |
323 } | 319 } |
324 | 320 |
325 uint8x16_t fVec; | 321 uint8x16_t fVec; |
326 }; | 322 }; |
327 | 323 |
328 #undef SHIFT32 | 324 #undef SHIFT32 |
329 #undef SHIFT16 | 325 #undef SHIFT16 |
330 #undef SHIFT8 | 326 #undef SHIFT8 |
331 | 327 |
332 template<> | 328 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { |
333 inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) { | |
334 return vcvtq_s32_f32(src.fVec); | 329 return vcvtq_s32_f32(src.fVec); |
335 } | 330 } |
336 | 331 |
| 332 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { |
| 333 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); |
| 334 uint16x4_t _16 = vqmovn_u32(_32); |
| 335 return vqmovn_u16(vcombine_u16(_16, _16)); |
| 336 } |
| 337 |
| 338 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { |
| 339 uint16x8_t _16 = vmovl_u8 (src.fVec) ; |
| 340 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); |
| 341 return vcvtq_f32_u32(_32); |
| 342 } |
| 343 |
| 344 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
| 345 const Sk4f& a, const Sk4f& b, const Sk4f& c, con
st Sk4f& d) { |
| 346 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), |
| 347 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], |
| 348 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), |
| 349 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0])
.val[0]); |
| 350 } |
| 351 |
337 } // namespace | 352 } // namespace |
338 | 353 |
339 #endif//SkNx_neon_DEFINED | 354 #endif//SkNx_neon_DEFINED |
OLD | NEW |