| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
| 9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
| 10 | 10 |
| (...skipping 132 matching lines...) |
| 143 }; | 143 }; |
| 144 | 144 |
| 145 template <> | 145 template <> |
| 146 class SkNx<4, float> { | 146 class SkNx<4, float> { |
| 147 public: | 147 public: |
| 148 SkNx(float32x4_t vec) : fVec(vec) {} | 148 SkNx(float32x4_t vec) : fVec(vec) {} |
| 149 | 149 |
| 150 SkNx() {} | 150 SkNx() {} |
| 151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} | 151 SkNx(float val) : fVec(vdupq_n_f32(val)) {} |
| 152 static SkNx Load(const float vals[4]) { return vld1q_f32(vals); } | 152 static SkNx Load(const float vals[4]) { return vld1q_f32(vals); } |
| 153 static SkNx FromBytes(const uint8_t vals[4]) { | |
| 154 uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); | |
| 155 uint16x8_t fix8_16 = vmovl_u8(fix8); | |
| 156 uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); | |
| 157 return SkNx(vcvtq_f32_u32(fix8_32)); | |
| 158 } | |
| 159 | |
| 160 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } | 153 SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } |
| 161 | 154 |
| 162 void store(float vals[4]) const { vst1q_f32(vals, fVec); } | 155 void store(float vals[4]) const { vst1q_f32(vals, fVec); } |
| 163 void toBytes(uint8_t bytes[4]) const { | |
| 164 uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); | |
| 165 uint16x4_t fix8_16 = vqmovn_u32(fix8_32); | |
| 166 uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); | |
| 167 vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0); | |
| 168 } | |
| 169 | |
| 170 static void ToBytes(uint8_t bytes[16], | |
| 171 const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) { | |
| 172 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), | |
| 173 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], | |
| 174 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), | |
| 175 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]); | |
| 176 } | |
| 177 | |
| 178 SkNx approxInvert() const { | 156 SkNx approxInvert() const { |
| 179 float32x4_t est0 = vrecpeq_f32(fVec), | 157 float32x4_t est0 = vrecpeq_f32(fVec), |
| 180 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); | 158 est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); |
| 181 return est1; | 159 return est1; |
| 182 } | 160 } |
| 183 SkNx invert() const { | 161 SkNx invert() const { |
| 184 float32x4_t est1 = this->approxInvert().fVec, | 162 float32x4_t est1 = this->approxInvert().fVec, |
| 185 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); | 163 est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1); |
| 186 return est2; | 164 return est2; |
| 187 } | 165 } |
| (...skipping 93 matching lines...) |
| 281 } | 259 } |
| 282 | 260 |
| 283 SkNx thenElse(const SkNx& t, const SkNx& e) const { | 261 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
| 284 return vbslq_u16(fVec, t.fVec, e.fVec); | 262 return vbslq_u16(fVec, t.fVec, e.fVec); |
| 285 } | 263 } |
| 286 | 264 |
| 287 uint16x8_t fVec; | 265 uint16x8_t fVec; |
| 288 }; | 266 }; |
| 289 | 267 |
| 290 template <> | 268 template <> |
| 269 class SkNx<4, uint8_t> { |
| 270 public: |
| 271 SkNx(const uint8x8_t& vec) : fVec(vec) {} |
| 272 |
| 273 SkNx() {} |
| 274 static SkNx Load(const uint8_t vals[4]) { |
| 275 return (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); |
| 276 } |
| 277 void store(uint8_t vals[4]) const { |
| 278 return vst1_lane_u32((uint32_t*)vals, (uint32x2_t)fVec, 0); |
| 279 } |
| 280 |
| 281 // TODO as needed |
| 282 |
| 283 uint8x8_t fVec; |
| 284 }; |
| 285 |
| 286 template <> |
| 291 class SkNx<16, uint8_t> { | 287 class SkNx<16, uint8_t> { |
| 292 public: | 288 public: |
| 293 SkNx(const uint8x16_t& vec) : fVec(vec) {} | 289 SkNx(const uint8x16_t& vec) : fVec(vec) {} |
| 294 | 290 |
| 295 SkNx() {} | 291 SkNx() {} |
| 296 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} | 292 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} |
| 297 static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); } | 293 static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); } |
| 298 | 294 |
| 299 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 295 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
| 300 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 296 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
| (...skipping 21 matching lines...) |
| 322 return vbslq_u8(fVec, t.fVec, e.fVec); | 318 return vbslq_u8(fVec, t.fVec, e.fVec); |
| 323 } | 319 } |
| 324 | 320 |
| 325 uint8x16_t fVec; | 321 uint8x16_t fVec; |
| 326 }; | 322 }; |
| 327 | 323 |
| 328 #undef SHIFT32 | 324 #undef SHIFT32 |
| 329 #undef SHIFT16 | 325 #undef SHIFT16 |
| 330 #undef SHIFT8 | 326 #undef SHIFT8 |
| 331 | 327 |
| 332 template<> | 328 template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { |
| 333 inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) { | |
| 334 return vcvtq_s32_f32(src.fVec); | 329 return vcvtq_s32_f32(src.fVec); |
| 335 } | 330 } |
| 336 | 331 |
| 332 template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { |
| 333 uint32x4_t _32 = vcvtq_u32_f32(src.fVec); |
| 334 uint16x4_t _16 = vqmovn_u32(_32); |
| 335 return vqmovn_u16(vcombine_u16(_16, _16)); |
| 336 } |
| 337 |
| 338 template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { |
| 339 uint16x8_t _16 = vmovl_u8 (src.fVec) ; |
| 340 uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); |
| 341 return vcvtq_f32_u32(_32); |
| 342 } |
| 343 |
| 344 static inline void Sk4f_ToBytes(uint8_t bytes[16], |
| 345 const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { |
| 346 vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), |
| 347 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], |
| 348 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), |
| 349 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]); |
| 350 } |
| 351 |
| 337 } // namespace | 352 } // namespace |
| 338 | 353 |
| 339 #endif//SkNx_neon_DEFINED | 354 #endif//SkNx_neon_DEFINED |
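
A few notes on the NEON idioms in this patch, with scalar sketches for illustration (the helper names below are ours, not Skia's).

approxInvert()/invert() are Newton-Raphson reciprocal refinement: vrecpeq_f32 yields a rough estimate e of 1/x, and vrecpsq_f32(e, x) computes (2 - e*x), so each e' = e * (2 - e*x) step roughly doubles the number of correct bits. A minimal scalar model:

    // Newton-Raphson for 1/x starting from estimate e0 (as vrecpeq_f32 would give).
    static inline float refine_recip(float x, float e0) {
        float e1 = e0 * (2.0f - x * e0);  // approxInvert(): one vrecpsq_f32 + vmulq_f32
        float e2 = e1 * (2.0f - x * e1);  // invert(): one more step
        return e2;
    }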
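The new SkNx_cast<uint8_t, float, 4> narrows through vqmovn_u32 and vqmovn_u16, which saturate, so values above 255 clamp rather than wrap; NEON's float-to-unsigned convert likewise clamps negatives to zero. A scalar sketch of those semantics (assuming non-negative inputs, since a plain C cast of a negative float to unsigned is undefined):

    #include <cstdint>

    // Models Sk4f -> Sk4b: truncate each float to u32, then saturate to 0..255.
    static inline void cast_4f_to_4b(uint8_t out[4], const float in[4]) {
        for (int i = 0; i < 4; i++) {
            uint32_t v = (uint32_t)in[i];           // vcvtq_u32_f32 truncates toward zero
            out[i] = (uint8_t)(v > 255 ? 255 : v);  // vqmovn_u32 + vqmovn_u16 saturate
        }
    }

The reverse cast reuses the old FromBytes body (widen u8 -> u16 -> u32, then vcvtq_f32_u32), and Sk4b's Load/store move all four bytes as a single 32-bit lane, which is why the pointers are cast to uint32_t*.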
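Sk4f_ToBytes keeps the old ToBytes shuffle: on little-endian ARM the low byte of each 32-bit lane sits at byte offsets 0, 4, 8, 12 of the reinterpreted vector, so two rounds of vuzpq_u8, keeping .val[0] (the even-indexed bytes) each time, leave exactly those sixteen low bytes in a, b, c, d order. Unlike the cast above, this path drops high bytes instead of saturating. A scalar model (hypothetical name, assuming all values already fit in 0..255):

    #include <cstdint>

    // Models Sk4f_ToBytes: low byte of each of 16 float->u32 lanes, no clamping.
    static inline void four_sk4f_to_bytes(uint8_t bytes[16],
                                          const float a[4], const float b[4],
                                          const float c[4], const float d[4]) {
        const float* srcs[4] = {a, b, c, d};
        for (int v = 0; v < 4; v++) {
            for (int i = 0; i < 4; i++) {
                bytes[4*v + i] = (uint8_t)(uint32_t)srcs[v][i];
            }
        }
    }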