| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkNx_sse_DEFINED | 8 #ifndef SkNx_sse_DEFINED |
| 9 #define SkNx_sse_DEFINED | 9 #define SkNx_sse_DEFINED |
| 10 | 10 |
| (...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 537 | 537 |
| 538 AI uint32_t operator[](int k) const { | 538 AI uint32_t operator[](int k) const { |
| 539 SkASSERT(0 <= k && k < 8); | 539 SkASSERT(0 <= k && k < 8); |
| 540 union { __m256i v; uint32_t us[8]; } pun = {fVec}; | 540 union { __m256i v; uint32_t us[8]; } pun = {fVec}; |
| 541 return pun.us[k&7]; | 541 return pun.us[k&7]; |
| 542 } | 542 } |
| 543 | 543 |
| 544 __m256i fVec; | 544 __m256i fVec; |
| 545 }; | 545 }; |
| 546 | 546 |
// Wrappers for _mm256_unpack{lo,hi}_pd() that take and return __m256, casting to and from __m256d.
| 548 AI static __m256 unpacklo_pd(__m256 x, __m256 y) { |
| 549 return _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(x), _mm256_c
astps_pd(y))); |
| 550 } |
| 551 AI static __m256 unpackhi_pd(__m256 x, __m256 y) { |
| 552 return _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(x), _mm256_c
astps_pd(y))); |
| 553 } |
| 554 |
| 547 template <> | 555 template <> |
| 548 class SkNx<8, float> { | 556 class SkNx<8, float> { |
| 549 public: | 557 public: |
| 550 AI SkNx(const __m256& vec) : fVec(vec) {} | 558 AI SkNx(const __m256& vec) : fVec(vec) {} |
| 551 | 559 |
| 552 AI SkNx() {} | 560 AI SkNx() {} |
| 553 AI SkNx(float val) : fVec(_mm256_set1_ps(val)) {} | 561 AI SkNx(float val) : fVec(_mm256_set1_ps(val)) {} |
| 554 AI SkNx(float a, float b, float c, float d, | 562 AI SkNx(float a, float b, float c, float d, |
| 555 float e, float f, float g, float h) : fVec(_mm256_setr_ps(a,b,c,
d,e,f,g,h)) {} | 563 float e, float f, float g, float h) : fVec(_mm256_setr_ps(a,b,c,
d,e,f,g,h)) {} |
| 556 | 564 |
| 557 AI SkNx(const SkNx_abi<8,float>& a) : fVec(a.vec) {} | 565 AI SkNx(const SkNx_abi<8,float>& a) : fVec(a.vec) {} |
| 558 AI operator SkNx_abi<8,float>() const { return { fVec }; } | 566 AI operator SkNx_abi<8,float>() const { return { fVec }; } |
| 559 | 567 |
| 560 AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const flo
at*)ptr); } | 568 AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const flo
at*)ptr); } |
| 561 AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); } | 569 AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); } |
| 562 | 570 |
| 571 AI static void Store4(void* ptr, |
| 572 const SkNx& r, const SkNx& g, const SkNx& b, const
SkNx& a) { |
| 573 __m256 rg0145 = _mm256_unpacklo_ps(r.fVec, g.fVec), // r0 g0 r1 g1
| r4 g4 r5 g5 |
| 574 rg2367 = _mm256_unpackhi_ps(r.fVec, g.fVec), // r2 ...
| r6 ... |
| 575 ba0145 = _mm256_unpacklo_ps(b.fVec, a.fVec), // b0 a0 b1 a1
| b4 a4 b5 a5 |
| 576 ba2367 = _mm256_unpackhi_ps(b.fVec, a.fVec); // b2 ...
| b6 ... |
| 577 |
| 578 __m256 _04 = unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4
a4 |
| 579 _15 = unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... |
| 580 _26 = unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... |
| 581 _37 = unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... |
| 582 |
| 583 __m256 _01 = _mm256_permute2f128_ps(_04, _15, 16), // 16 == 010 000
== lo, lo |
| 584 _23 = _mm256_permute2f128_ps(_26, _37, 16), |
| 585 _45 = _mm256_permute2f128_ps(_04, _15, 25), // 25 == 011 001
== hi, hi |
| 586 _67 = _mm256_permute2f128_ps(_26, _37, 25); |
| 587 |
| 588 _mm256_storeu_ps((float*)ptr + 0*8, _01); |
| 589 _mm256_storeu_ps((float*)ptr + 1*8, _23); |
| 590 _mm256_storeu_ps((float*)ptr + 2*8, _45); |
| 591 _mm256_storeu_ps((float*)ptr + 3*8, _67); |
| 592 } |
| 593 |
| 563 AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fV
ec); } | 594 AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fV
ec); } |
| 564 AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fV
ec); } | 595 AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fV
ec); } |
| 565 AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fV
ec); } | 596 AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fV
ec); } |
| 566 AI SkNx operator/(const SkNx& o) const { return _mm256_div_ps(fVec, o.fV
ec); } | 597 AI SkNx operator/(const SkNx& o) const { return _mm256_div_ps(fVec, o.fV
ec); } |
| 567 | 598 |
| 568 AI SkNx operator==(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_EQ_OQ); } | 599 AI SkNx operator==(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_EQ_OQ); } |
| 569 AI SkNx operator!=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_NEQ_OQ); } | 600 AI SkNx operator!=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_NEQ_OQ); } |
| 570 AI SkNx operator <(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_LT_OQ); } | 601 AI SkNx operator <(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_LT_OQ); } |
| 571 AI SkNx operator >(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_GT_OQ); } | 602 AI SkNx operator >(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_GT_OQ); } |
| 572 AI SkNx operator<=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_LE_OQ); } | 603 AI SkNx operator<=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.f
Vec, _CMP_LE_OQ); } |
| (...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 719 return src.fVec; | 750 return src.fVec; |
| 720 } | 751 } |
| 721 | 752 |
| 722 AI static Sk4i Sk4f_round(const Sk4f& x) { | 753 AI static Sk4i Sk4f_round(const Sk4f& x) { |
| 723 return _mm_cvtps_epi32(x.fVec); | 754 return _mm_cvtps_epi32(x.fVec); |
| 724 } | 755 } |
| 725 | 756 |
| 726 } // namespace | 757 } // namespace |
| 727 | 758 |
| 728 #endif//SkNx_sse_DEFINED | 759 #endif//SkNx_sse_DEFINED |
| OLD | NEW |