// It is important _not_ to put header guards here.
// This file will be intentionally included three times.
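//
// A sketch of that three-pass pattern, as an umbrella header might drive it
// (the including header is not shown here, so treat the shape below as
// illustrative rather than exact):
//
//   #define SK4X_PREAMBLE
//   #include "Sk4x_sse.h"      // pass 1: includes, helpers, type mapping
//   #undef SK4X_PREAMBLE
//
//   template <typename T> class Sk4x {
//       ...
//       #define SK4X_PRIVATE
//       #include "Sk4x_sse.h"  // pass 2: private members spliced into the class
//       #undef SK4X_PRIVATE
//   };
//
//   #include "Sk4x_sse.h"      // pass 3: out-of-line method definitions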

// Useful reading:
//   https://software.intel.com/sites/landingpage/IntrinsicsGuide/

#if defined(SK4X_PREAMBLE)
    // Code in this file may assume SSE and SSE2.
    #include <emmintrin.h>

    // It must check for later instruction sets.
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        #include <immintrin.h>
    #endif

    // A little bit of template metaprogramming to map
    // float to __m128 and int32_t to __m128i.
    template <typename T> struct SkScalarToSIMD;
    template <> struct SkScalarToSIMD<float>   { typedef __m128  Type; };
    template <> struct SkScalarToSIMD<int32_t> { typedef __m128i Type; };

    // These are all free.  MSVC insists we use _mm_castA_B(a) instead of (B)a.
    // (static inline keeps multiple translation units from colliding on these.)
    static inline __m128  as_4f(__m128i v) { return _mm_castsi128_ps(v); }
    static inline __m128  as_4f(__m128  v) { return                  v ; }
    static inline __m128i as_4i(__m128i v) { return                  v ; }
    static inline __m128i as_4i(__m128  v) { return _mm_castps_si128(v); }

#elif defined(SK4X_PRIVATE)
    // The best (1 op) way to get all -1s in a register.  Our compilers are a little too cautious...
    static __m128i True() {
    #ifdef __GNUC__
        #pragma GCC diagnostic push
        #pragma GCC diagnostic ignored "-Wuninitialized"
            // Any value compared with itself is equal, so every lane fills with
            // 1s no matter what garbage the register starts with.
            __m128i uninitialized;
            return _mm_cmpeq_epi32(uninitialized, uninitialized);
        #pragma GCC diagnostic pop
    #else
        // Can't figure out how to suppress C4700 from MSVC.  Oh well, we'll be a little slower.
        __m128i zero = _mm_setzero_si128();
        return _mm_cmpeq_epi32(zero, zero);
    #endif
    }

    // Leaving these implicit makes the rest of the code below a bit less noisy to read.
    Sk4x(__m128i);
    Sk4x(__m128);

    Sk4x andNot(const Sk4x&) const;

    typename SkScalarToSIMD<T>::Type fVec;

#else  // Method definitions.

// Helps to get these in before anything else.
template <> inline Sk4f::Sk4x(__m128i v) : fVec(as_4f(v)) {}
template <> inline Sk4f::Sk4x(__m128  v) : fVec(      v ) {}
template <> inline Sk4i::Sk4x(__m128i v) : fVec(      v ) {}
template <> inline Sk4i::Sk4x(__m128  v) : fVec(as_4i(v)) {}

// Next, methods whose implementation is the same for Sk4f and Sk4i.
template <typename T> Sk4x<T>::Sk4x() {}
template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; }
template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {
    fVec = other.fVec;
    return *this;
}

// We pun in these _mm_shuffle_* methods a little to use the fastest / most available methods.
// They're all bit-preserving operations so it shouldn't matter.

template <typename T>
Sk4x<T> Sk4x<T>::zwxy() const { return _mm_shuffle_epi32(as_4i(fVec), _MM_SHUFFLE(1,0,3,2)); }

template <typename T>
Sk4x<T> Sk4x<T>::XYAB(const Sk4x<T>& a, const Sk4x<T>& b) {
    return _mm_movelh_ps(as_4f(a.fVec), as_4f(b.fVec));
}

template <typename T>
Sk4x<T> Sk4x<T>::ZWCD(const Sk4x<T>& a, const Sk4x<T>& b) {
    return _mm_movehl_ps(as_4f(b.fVec), as_4f(a.fVec));
}
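
// For reference, with lanes written left to right:
//   (x,y,z,w).zwxy()            == (z,w,x,y)
//   XYAB((x,y,z,w), (a,b,c,d))  == (x,y,a,b)
//   ZWCD((x,y,z,w), (a,b,c,d))  == (z,w,c,d)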

// Now we'll write all the Sk4f-specific methods.  This M() macro will remove some noise.
#define M(...) template <> inline __VA_ARGS__ Sk4f::

M() Sk4x(float a, float b, float c, float d) : fVec(_mm_set_ps(d,c,b,a)) {}

M(Sk4f) Load       (const float fs[4]) { return _mm_loadu_ps(fs); }
M(Sk4f) LoadAligned(const float fs[4]) { return _mm_load_ps (fs); }

M(void) store       (float fs[4]) const { _mm_storeu_ps(fs, fVec); }
M(void) storeAligned(float fs[4]) const { _mm_store_ps (fs, fVec); }

template <> template <>
Sk4i Sk4f::reinterpret<Sk4i>() const { return as_4i(fVec); }

template <> template <>
Sk4i Sk4f::cast<Sk4i>() const { return _mm_cvtps_epi32(fVec); }
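
// The difference, by example (assuming the default round-to-nearest mode):
//   Sk4f(1.5f, ...).cast<Sk4i>()        -> lane value 2 (numeric conversion)
//   Sk4f(1.5f, ...).reinterpret<Sk4i>() -> lane value 0x3FC00000 (raw IEEE bits)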

// We're going to try a little experiment here and skip allTrue(), anyTrue(), and bit-manipulators
// for Sk4f.  Code that calls them probably does so accidentally.
// Ask mtklein to fill these in if you really need them.

M(Sk4f) add     (const Sk4f& o) const { return _mm_add_ps(fVec, o.fVec); }
M(Sk4f) subtract(const Sk4f& o) const { return _mm_sub_ps(fVec, o.fVec); }
M(Sk4f) multiply(const Sk4f& o) const { return _mm_mul_ps(fVec, o.fVec); }
M(Sk4f) divide  (const Sk4f& o) const { return _mm_div_ps(fVec, o.fVec); }

M(Sk4i) equal           (const Sk4f& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
M(Sk4i) notEqual        (const Sk4f& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
M(Sk4i) lessThan        (const Sk4f& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
M(Sk4i) greaterThan     (const Sk4f& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
M(Sk4i) lessThanEqual   (const Sk4f& o) const { return _mm_cmple_ps (fVec, o.fVec); }
M(Sk4i) greaterThanEqual(const Sk4f& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
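
// Note: the compares above return __m128 masks; they become Sk4i through the
// implicit Sk4x(__m128) constructor, which reinterprets the bits for free.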

M(Sk4f) Min(const Sk4f& a, const Sk4f& b) { return _mm_min_ps(a.fVec, b.fVec); }
M(Sk4f) Max(const Sk4f& a, const Sk4f& b) { return _mm_max_ps(a.fVec, b.fVec); }

// Now we'll write all the Sk4i-specific methods.  Same deal for M().
#undef M
#define M(...) template <> inline __VA_ARGS__ Sk4i::

M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_set_epi32(d,c,b,a)) {}

M(Sk4i) Load       (const int32_t is[4]) { return _mm_loadu_si128((const __m128i*)is); }
M(Sk4i) LoadAligned(const int32_t is[4]) { return _mm_load_si128 ((const __m128i*)is); }

M(void) store       (int32_t is[4]) const { _mm_storeu_si128((__m128i*)is, fVec); }
M(void) storeAligned(int32_t is[4]) const { _mm_store_si128 ((__m128i*)is, fVec); }

template <> template <>
Sk4f Sk4i::reinterpret<Sk4f>() const { return as_4f(fVec); }

template <> template <>
Sk4f Sk4i::cast<Sk4f>() const { return _mm_cvtepi32_ps(fVec); }

M(bool) allTrue() const { return 0xf == _mm_movemask_ps(as_4f(fVec)); }
M(bool) anyTrue() const { return 0x0 != _mm_movemask_ps(as_4f(fVec)); }
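
// _mm_movemask_ps packs the four sign bits into the low nibble of an int, so a
// lane counts as "true" exactly when its top bit is set, as the compares above
// (which produce all-0 or all-1 lanes) guarantee.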

M(Sk4i) bitNot() const { return _mm_xor_si128(fVec, True()); }
M(Sk4i) bitAnd(const Sk4i& o) const { return _mm_and_si128(fVec, o.fVec); }
M(Sk4i) bitOr (const Sk4i& o) const { return _mm_or_si128 (fVec, o.fVec); }

M(Sk4i) equal           (const Sk4i& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
M(Sk4i) lessThan        (const Sk4i& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
M(Sk4i) greaterThan     (const Sk4i& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }
M(Sk4i) notEqual        (const Sk4i& o) const { return this->      equal(o).bitNot();  }
M(Sk4i) lessThanEqual   (const Sk4i& o) const { return this->greaterThan(o).bitNot();  }
M(Sk4i) greaterThanEqual(const Sk4i& o) const { return this->   lessThan(o).bitNot();  }

M(Sk4i) add     (const Sk4i& o) const { return _mm_add_epi32(fVec, o.fVec); }
M(Sk4i) subtract(const Sk4i& o) const { return _mm_sub_epi32(fVec, o.fVec); }

// SSE doesn't have integer division.  Let's see how far we can get without Sk4i::divide().

// andNot() is declared unconditionally above, so define it for both paths here.
M(Sk4i) andNot(const Sk4i& o) const { return _mm_andnot_si128(o.fVec, fVec); }

// Sk4i's multiply(), Min(), and Max() all improve significantly with SSE4.1.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    M(Sk4i) multiply(const Sk4i& o) const { return _mm_mullo_epi32(fVec, o.fVec); }
    M(Sk4i) Min(const Sk4i& a, const Sk4i& b) { return _mm_min_epi32(a.fVec, b.fVec); }
    M(Sk4i) Max(const Sk4i& a, const Sk4i& b) { return _mm_max_epi32(a.fVec, b.fVec); }
#else
    M(Sk4i) multiply(const Sk4i& o) const {
        // First, two 32->64 bit multiplies on the even and odd lanes.
        __m128i mul02 = _mm_mul_epu32(fVec, o.fVec),
                mul13 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
        // Now recombine the low 32 bits of each product.
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul02, _MM_SHUFFLE(0,0,2,0)),
                                  _mm_shuffle_epi32(mul13, _MM_SHUFFLE(0,0,2,0)));
    }
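
    // A worked example of multiply() above: with fVec == (a0,a1,a2,a3) and
    // o.fVec == (b0,b1,b2,b3), _mm_mul_epu32 multiplies lanes 0 and 2 of its
    // inputs, so mul02 == (a0*b0, a2*b2) and mul13 == (a1*b1, a3*b3) as 64-bit
    // lanes; the shuffles pull out the low 32 bits of each product and the
    // unpack interleaves them back into (a0*b0, a1*b1, a2*b2, a3*b3).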
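    // Without _mm_min_epi32/_mm_max_epi32 we emulate a per-lane select:
    // (mask & a) | (~mask & b) keeps a's lane wherever the comparison mask is
    // all 1s and b's lane elsewhere.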
    M(Sk4i) Min(const Sk4i& a, const Sk4i& b) {
        Sk4i less = a.lessThan(b);
        return a.bitAnd(less).bitOr(b.andNot(less));
    }
    M(Sk4i) Max(const Sk4i& a, const Sk4i& b) {
        Sk4i less = a.lessThan(b);
        return b.bitAnd(less).bitOr(a.andNot(less));
    }
#endif

#undef M

#endif  // Method definitions.