| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  * Copyright 2015 Google Inc. | 2  * Copyright 2015 Google Inc. | 
| 3  * | 3  * | 
| 4  * Use of this source code is governed by a BSD-style license that can be | 4  * Use of this source code is governed by a BSD-style license that can be | 
| 5  * found in the LICENSE file. | 5  * found in the LICENSE file. | 
| 6  */ | 6  */ | 
| 7 | 7 | 
| 8 #include "SkOpts.h" | 8 #include "SkOpts.h" | 
| 9 | 9 | 
| 10 #define SK_OPTS_NS sk_sse41 | 10 #define SK_OPTS_NS sk_sse41 | 
| 11 #include "SkBlurImageFilter_opts.h" | 11 #include "SkBlurImageFilter_opts.h" | 
| 12 | 12 | 
| 13 #ifndef SK_SUPPORT_LEGACY_X86_BLITS | 13 #ifndef SK_SUPPORT_LEGACY_X86_BLITS | 
| 14 | 14 | 
| 15 // This file deals mostly with unpacked 8-bit values, | 15 namespace sk_sse41 { | 
| 16 // i.e. values between 0 and 255, but in 16-bit lanes with 0 at the top. |  | 
| 17 | 16 | 
| 18 // So __m128i typically represents 1 or 2 pixels, and m128ix2 represents 4. | 17 // An SSE register holding at most 64 bits of useful data in the low lanes. | 
| 19 struct m128ix2 { __m128i lo, hi; }; | 18 struct m64i { | 
|  | 19     __m128i v; | 
|  | 20     /*implicit*/ m64i(__m128i v) : v(v) {} | 
|  | 21     operator __m128i() const { return v; } | 
|  | 22 }; | 
| 20 | 23 | 
| 21 // unpack{lo,hi}() get our raw pixels unpacked, from half of 4 packed pixels to 2 unpacked pixels. | 24 // Load 4, 2, or 1 constant pixels or coverages (4x replicated). |
| 22 static inline __m128i unpacklo(__m128i x) { return _mm_cvtepu8_epi16(x); } | 25 static __m128i next4(uint32_t val) { return _mm_set1_epi32(val); } | 
| 23 static inline __m128i unpackhi(__m128i x) { return _mm_unpackhi_epi8(x, _mm_setzero_si128()); } | 26 static m64i    next2(uint32_t val) { return _mm_set1_epi32(val); } |
|  | 27 static m64i    next1(uint32_t val) { return _mm_set1_epi32(val); } | 
| 24 | 28 | 
| 25 // pack() converts back, from 4 unpacked pixels to 4 packed pixels. | 29 static __m128i next4(uint8_t val) { return _mm_set1_epi8(val); } | 
| 26 static inline __m128i pack(__m128i lo, __m128i hi) { return _mm_packus_epi16(lo, hi); } | 30 static m64i    next2(uint8_t val) { return _mm_set1_epi8(val); } |
|  | 31 static m64i    next1(uint8_t val) { return _mm_set1_epi8(val); } | 
| 27 | 32 | 
| 28 // These nextN() functions abstract over the difference between iterating over | 33 // Load 4, 2, or 1 variable pixels or coverages (4x replicated), | 
| 29 // an array of values and returning a constant value, for uint8_t and uint32_t. | 34 // incrementing the pointer past what we read. | 
| 30 // The nextN() taking pointers increment that pointer past where they read. | 35 static __m128i next4(const uint32_t*& ptr) { | 
| 31 // | 36     auto r = _mm_loadu_si128((const __m128i*)ptr); | 
| 32 // nextN() returns N unpacked pixels or 4N unpacked coverage values. | 37     ptr += 4; | 
| 33 | 38     return r; | 
| 34 static inline __m128i next1(uint8_t val) { return _mm_set1_epi16(val); } |  | 
| 35 static inline __m128i next2(uint8_t val) { return _mm_set1_epi16(val); } |  | 
| 36 static inline m128ix2 next4(uint8_t val) { return { next2(val), next2(val) }; } |  | 
| 37 |  | 
| 38 static inline __m128i next1(uint32_t val) { return unpacklo(_mm_cvtsi32_si128(val)); } |  |
| 39 static inline __m128i next2(uint32_t val) { return unpacklo(_mm_set1_epi32(val)); } |  |
| 40 static inline m128ix2 next4(uint32_t val) { return { next2(val), next2(val) }; } |  | 
| 41 |  | 
| 42 static inline __m128i next1(const uint8_t*& ptr) { return _mm_set1_epi16(*ptr++); } |  |
| 43 static inline __m128i next2(const uint8_t*& ptr) { |  | 
| 44     auto r = _mm_cvtsi32_si128(*(const uint16_t*)ptr); |  | 
| 45     ptr += 2; |  | 
| 46     const int _ = ~0; |  | 
| 47     return _mm_shuffle_epi8(r, _mm_setr_epi8(0,_,0,_,0,_,0,_, 1,_,1,_,1,_,1,_)); |  | 
| 48 } | 39 } | 
| 49 static inline m128ix2 next4(const uint8_t*& ptr) { | 40 static m64i next2(const uint32_t*& ptr) { | 
| 50     auto r = _mm_cvtsi32_si128(*(const uint32_t*)ptr); | 41     auto r = _mm_loadl_epi64((const __m128i*)ptr); | 
| 51     ptr += 4; |  | 
| 52     const int _ = ~0; |  | 
| 53     auto lo = _mm_shuffle_epi8(r, _mm_setr_epi8(0,_,0,_,0,_,0,_, 1,_,1,_,1,_,1,_)), |  |
| 54          hi = _mm_shuffle_epi8(r, _mm_setr_epi8(2,_,2,_,2,_,2,_, 3,_,3,_,3,_,3,_)); |  |
| 55     return { lo, hi }; |  | 
| 56 } |  | 
| 57 |  | 
| 58 static inline __m128i next1(const uint32_t*& ptr) { return unpacklo(_mm_cvtsi32_si128(*ptr++)); } |  |
| 59 static inline __m128i next2(const uint32_t*& ptr) { |  | 
| 60     auto r = unpacklo(_mm_loadl_epi64((const __m128i*)ptr)); |  | 
| 61     ptr += 2; | 42     ptr += 2; | 
| 62     return r; | 43     return r; | 
| 63 } | 44 } | 
| 64 static inline m128ix2 next4(const uint32_t*& ptr) { | 45 static m64i next1(const uint32_t*& ptr) { | 
| 65     auto packed = _mm_loadu_si128((const __m128i*)ptr); | 46     auto r = _mm_cvtsi32_si128(*ptr); | 
| 66     ptr += 4; | 47     ptr += 1; | 
| 67     return { unpacklo(packed), unpackhi(packed) }; | 48     return r; | 
| 68 } | 49 } | 
| 69 | 50 | 
| 70 // Divide by 255 with rounding. | 51 // xyzw -> xxxx yyyy zzzz wwww | 
| 71 // (x+127)/255 == ((x+128)*257)>>16. | 52 static __m128i replicate_coverage(__m128i xyzw) { | 
| 72 // Sometimes we can be more efficient by breaking this into two parts. | 53     const uint8_t mask[] = { 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3 }; | 
| 73 static inline __m128i div255_part1(__m128i x) { return _mm_add_epi16(x, _mm_set1_epi16(128)); } | 54     return _mm_shuffle_epi8(xyzw, _mm_load_si128((const __m128i*)mask)); |
| 74 static inline __m128i div255_part2(__m128i x) { return _mm_mulhi_epu16(x, _mm_set1_epi16(257)); } |  |
| 75 static inline __m128i div255(__m128i x) { return div255_part2(div255_part1(x)); } |  |
| 76 |  | 
| 77 // (x*y+127)/255, a byte multiply. |  | 
| 78 static inline __m128i scale(__m128i x, __m128i y) { |  | 
| 79     return div255(_mm_mullo_epi16(x, y)); |  | 
| 80 } | 55 } | 
| 81 | 56 | 
| 82 // (255 - x). | 57 static __m128i next4(const uint8_t*& ptr) { | 
| 83 static inline __m128i inv(__m128i x) { | 58     auto r = replicate_coverage(_mm_cvtsi32_si128(*(const uint32_t*)ptr)); | 
| 84     return _mm_xor_si128(_mm_set1_epi16(0x00ff), x);  // This seems a bit faster than _mm_sub_epi16. | 59     ptr += 4; |
|  | 60     return r; | 
| 85 } | 61 } | 
| 86 | 62 static m64i next2(const uint8_t*& ptr) { | 
| 87 // ARGB argb -> AAAA aaaa | 63     auto r = replicate_coverage(_mm_cvtsi32_si128(*(const uint16_t*)ptr)); | 
| 88 static inline __m128i alphas(__m128i px) { | 64     ptr += 2; | 
| 89     const int a = 2 * (SK_A32_SHIFT/8);  // SK_A32_SHIFT is typically 24, so this is typically 6. | 65     return r; |
| 90     const int _ = ~0; | 66 } | 
| 91     return _mm_shuffle_epi8(px, _mm_setr_epi8(a+0,_,a+0,_,a+0,_,a+0,_, a+8,_,a+8,_,a+8,_,a+8,_)); | 67 static m64i next1(const uint8_t*& ptr) { |
|  | 68     auto r = replicate_coverage(_mm_cvtsi32_si128(*ptr)); | 
|  | 69     ptr += 1; | 
|  | 70     return r; | 
| 92 } | 71 } | 
| 93 | 72 | 
| 94 // For i = 0...n, tgt = fn(dst,src,cov), where Dst,Src,and Cov can be constants or arrays. | 73 // For i = 0...n, tgt = fn(dst,src,cov), where Dst,Src,and Cov can be constants or arrays. |
| 95 template <typename Dst, typename Src, typename Cov, typename Fn> | 74 template <typename Dst, typename Src, typename Cov, typename Fn> | 
| 96 static inline void loop(int n, uint32_t* t, const Dst dst, const Src src, const Cov cov, Fn&& fn) { | 75 static void loop(int n, uint32_t* t, const Dst dst, const Src src, const Cov cov, Fn&& fn) { |
| 97     // We don't want to muck with the callers' pointers, so we make them const and copy here. | 76     // We don't want to muck with the callers' pointers, so we make them const and copy here. |
| 98     Dst d = dst; | 77     Dst d = dst; | 
| 99     Src s = src; | 78     Src s = src; | 
| 100     Cov c = cov; | 79     Cov c = cov; | 
| 101 | 80 | 
| 102     // Writing this as a single while-loop helps hoist loop invariants from fn. | 81     // Writing this as a single while-loop helps hoist loop invariants from fn. | 
| 103     while (n) { | 82     while (n) { | 
| 104         if (n >= 4) { | 83         if (n >= 4) { | 
| 105             auto d4 = next4(d), | 84             _mm_storeu_si128((__m128i*)t, fn(next4(d), next4(s), next4(c))); | 
| 106                  s4 = next4(s), |  | 
| 107                  c4 = next4(c); |  | 
| 108             auto lo = fn(d4.lo, s4.lo, c4.lo), |  | 
| 109                  hi = fn(d4.hi, s4.hi, c4.hi); |  | 
| 110             _mm_storeu_si128((__m128i*)t, pack(lo,hi)); |  | 
| 111             t += 4; | 85             t += 4; | 
| 112             n -= 4; | 86             n -= 4; | 
| 113             continue; | 87             continue; | 
| 114         } | 88         } | 
| 115         if (n & 2) { | 89         if (n & 2) { | 
| 116             auto r = fn(next2(d), next2(s), next2(c)); | 90             _mm_storel_epi64((__m128i*)t, fn(next2(d), next2(s), next2(c))); | 
| 117             _mm_storel_epi64((__m128i*)t, pack(r,r)); |  | 
| 118             t += 2; | 91             t += 2; | 
| 119         } | 92         } | 
| 120         if (n & 1) { | 93         if (n & 1) { | 
| 121             auto r = fn(next1(d), next1(s), next1(c)); | 94             *t = _mm_cvtsi128_si32(fn(next1(d), next1(s), next1(c))); | 
| 122             *t = _mm_cvtsi128_si32(pack(r,r)); |  | 
| 123         } | 95         } | 
| 124         return; | 96         return; | 
| 125     } | 97     } | 
| 126 } | 98 } | 
| 127 | 99 | 
| 128 namespace sk_sse41 { | 100 //                                             packed | 
|  | 101 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|  | 102 //                                            unpacked | 
|  | 103 | 
|  | 104 // Everything on the packed side of the squiggly line deals with densely packed 8-bit data, |
|  | 105 // e.g. [BGRA bgra ... ] for pixels or [ CCCC cccc ... ] for coverage. | 
|  | 106 // | 
|  | 107 // Everything on the unpacked side of the squiggly line deals with unpacked 8-bit data, |
|  | 108 // e.g [B_G_ R_A_ b_g_ r_a_ ] for pixels or [ C_C_ C_C_ c_c_ c_c_ c_c_ ] for coverage, |
|  | 109 // where _ is a zero byte. | 
|  | 110 // | 
|  | 111 // Adapt<Fn> / adapt(fn) allow the two sides to interoperate, | 
|  | 112 // by unpacking arguments, calling fn, then packing the results. | 
|  | 113 // | 
|  | 114 // This lets us write most of our code in terms of unpacked inputs (considerably simpler) |
|  | 115 // and all the packing and unpacking is handled automatically. | 
|  | 116 | 
|  | 117 template <typename Fn> | 
|  | 118 struct Adapt { | 
|  | 119     Fn fn; | 
|  | 120 | 
|  | 121     __m128i operator()(__m128i d, __m128i s, __m128i c) { | 
|  | 122         auto lo = [](__m128i x) { return _mm_unpacklo_epi8(x, _mm_setzero_si128()); }; |
|  | 123         auto hi = [](__m128i x) { return _mm_unpackhi_epi8(x, _mm_setzero_si128()); }; |
|  | 124         return _mm_packus_epi16(fn(lo(d), lo(s), lo(c)), | 
|  | 125                                 fn(hi(d), hi(s), hi(c))); | 
|  | 126     } | 
|  | 127 | 
|  | 128     m64i operator()(const m64i& d, const m64i& s, const m64i& c) { | 
|  | 129         auto lo = [](__m128i x) { return _mm_unpacklo_epi8(x, _mm_setzero_si128()); }; |
|  | 130         auto r = fn(lo(d), lo(s), lo(c)); | 
|  | 131         return _mm_packus_epi16(r, r); | 
|  | 132     } | 
|  | 133 }; | 
|  | 134 | 
|  | 135 template <typename Fn> | 
|  | 136 static Adapt<Fn> adapt(Fn&& fn) { return { fn }; } | 
|  | 137 | 
|  | 138 // These helpers all work exclusively with unpacked 8-bit values, | 
|  | 139 // except div255() which is 16-bit -> unpacked 8-bit, and mul255() which is the reverse. |
|  | 140 | 
|  | 141 // Divide by 255 with rounding. | 
|  | 142 // (x+127)/255 == ((x+128)*257)>>16. | 
|  | 143 // Sometimes we can be more efficient by breaking this into two parts. | 
|  | 144 static __m128i div255_part1(__m128i x) { return _mm_add_epi16(x, _mm_set1_epi16(128)); } |
|  | 145 static __m128i div255_part2(__m128i x) { return _mm_mulhi_epu16(x, _mm_set1_epi16(257)); } |
|  | 146 static __m128i div255(__m128i x) { return div255_part2(div255_part1(x)); } | 
|  | 147 | 
|  | 148 // (x*y+127)/255, a byte multiply. | 
|  | 149 static __m128i scale(__m128i x, __m128i y) { return div255(_mm_mullo_epi16(x, y)); } |
|  | 150 | 
|  | 151 // (255 * x). | 
|  | 152 static __m128i mul255(__m128i x) { return _mm_sub_epi16(_mm_slli_epi16(x, 8), x); } |
|  | 153 | 
|  | 154 // (255 - x). | 
|  | 155 static __m128i inv(__m128i x) { return _mm_xor_si128(_mm_set1_epi16(0x00ff), x); } |
|  | 156 | 
|  | 157 // ARGB argb -> AAAA aaaa | 
|  | 158 static __m128i alphas(__m128i px) { | 
|  | 159     const int a = 2 * (SK_A32_SHIFT/8);  // SK_A32_SHIFT is typically 24, so this is typically 6. |
|  | 160     const int _ = ~0; | 
|  | 161     return _mm_shuffle_epi8(px, _mm_setr_epi8(a+0,_,a+0,_,a+0,_,a+0,_, a+8,_,a+8,_,a+8,_,a+8,_)); |
|  | 162 } | 
| 129 | 163 | 
| 130 // SrcOver, with a constant source and full coverage. | 164 // SrcOver, with a constant source and full coverage. | 
| 131 static void blit_row_color32(SkPMColor* tgt, const SkPMColor* dst, int n, SkPMColor src) { | 165 static void blit_row_color32(SkPMColor* tgt, const SkPMColor* dst, int n, SkPMColor src) { |
| 132     // We want to calculate s + (d * inv(alphas(s)) + 127)/255. | 166     // We want to calculate s + (d * inv(alphas(s)) + 127)/255. | 
| 133     // We'd generally do that div255 as s + ((d * inv(alphas(s)) + 128)*257)>>16. | 167     // We'd generally do that div255 as s + ((d * inv(alphas(s)) + 128)*257)>>16. |
| 134 | 168 | 
| 135     // But we can go one step further to ((s*255 + 128 + d*inv(alphas(s)))*257)>>16. | 169     // But we can go one step further to ((s*255 + 128 + d*inv(alphas(s)))*257)>>16. |
| 136     // This lets us hoist (s*255+128) and inv(alphas(s)) out of the loop. | 170     // This lets us hoist (s*255+128) and inv(alphas(s)) out of the loop. | 
| 137     __m128i s = next2(src), | 171     __m128i s = _mm_unpacklo_epi8(_mm_set1_epi32(src), _mm_setzero_si128()), | 
| 138             s_255_128 = div255_part1(_mm_mullo_epi16(s, _mm_set1_epi16(255))), | 172             s_255_128 = div255_part1(mul255(s)), | 
| 139             A = inv(alphas(s)); | 173             A = inv(alphas(s)); | 
| 140 | 174 | 
| 141     const uint8_t cov = 0xff; | 175     const uint8_t cov = 0xff; | 
| 142     loop(n, tgt, dst, src, cov, [=](__m128i d, __m128i, __m128i) { | 176     loop(n, tgt, dst, src, cov, adapt([=](__m128i d, __m128i, __m128i) { | 
| 143         return div255_part2(_mm_add_epi16(s_255_128, _mm_mullo_epi16(d, A))); | 177         return div255_part2(_mm_add_epi16(s_255_128, _mm_mullo_epi16(d, A))); | 
| 144     }); | 178     })); | 
| 145 } | 179 } | 
| 146 | 180 | 
| 147 // SrcOver, with a constant source and variable coverage. | 181 // SrcOver, with a constant source and variable coverage. | 
| 148 // If the source is opaque, SrcOver becomes Src. | 182 // If the source is opaque, SrcOver becomes Src. | 
| 149 static void blit_mask_d32_a8(SkPMColor* dst,     size_t dstRB, | 183 static void blit_mask_d32_a8(SkPMColor* dst,     size_t dstRB, | 
| 150                              const SkAlpha* cov, size_t covRB, | 184                              const SkAlpha* cov, size_t covRB, | 
| 151                              SkColor color, int w, int h) { | 185                              SkColor color, int w, int h) { | 
| 152     if (SkColorGetA(color) == 0xFF) { | 186     if (SkColorGetA(color) == 0xFF) { | 
| 153         const SkPMColor src = SkSwizzle_BGRA_to_PMColor(color); | 187         const SkPMColor src = SkSwizzle_BGRA_to_PMColor(color); | 
| 154         while (h --> 0) { | 188         while (h --> 0) { | 
| 155             loop(w, dst, (const SkPMColor*)dst, src, cov, [](__m128i d, __m128i s, __m128i c) { | 189             loop(w, dst, (const SkPMColor*)dst, src, cov, |
|  | 190                     adapt([](__m128i d, __m128i s, __m128i c) { | 
| 156                 // Src blend mode: a simple lerp from d to s by c. | 191                 // Src blend mode: a simple lerp from d to s by c. | 
| 157                 // TODO: try a pmaddubsw version? | 192                 // TODO: try a pmaddubsw version? | 
| 158                 return div255(_mm_add_epi16(_mm_mullo_epi16(inv(c),d), _mm_mullo_epi16(c,s))); | 193                 return div255(_mm_add_epi16(_mm_mullo_epi16(inv(c),d), |
| 159             }); | 194                                             _mm_mullo_epi16(    c ,s))); | 
|  | 195             })); | 
| 160             dst += dstRB / sizeof(*dst); | 196             dst += dstRB / sizeof(*dst); | 
| 161             cov += covRB / sizeof(*cov); | 197             cov += covRB / sizeof(*cov); | 
| 162         } | 198         } | 
| 163     } else { | 199     } else { | 
| 164         const SkPMColor src = SkPreMultiplyColor(color); | 200         const SkPMColor src = SkPreMultiplyColor(color); | 
| 165         while (h --> 0) { | 201         while (h --> 0) { | 
| 166             loop(w, dst, (const SkPMColor*)dst, src, cov, [](__m128i d, __m128i s, __m128i c) { | 202             loop(w, dst, (const SkPMColor*)dst, src, cov, |
|  | 203                     adapt([](__m128i d, __m128i s, __m128i c) { | 
| 167                 // SrcOver blend mode, with coverage folded into source alpha. | 204                 // SrcOver blend mode, with coverage folded into source alpha. | 
| 168                 __m128i sc = scale(s,c), | 205                 __m128i sc = scale(s,c), | 
| 169                         AC = inv(alphas(sc)); | 206                         AC = inv(alphas(sc)); | 
| 170                 return _mm_add_epi16(sc, scale(d,AC)); | 207                 return _mm_add_epi16(sc, scale(d,AC)); | 
| 171             }); | 208             })); | 
| 172             dst += dstRB / sizeof(*dst); | 209             dst += dstRB / sizeof(*dst); | 
| 173             cov += covRB / sizeof(*cov); | 210             cov += covRB / sizeof(*cov); | 
| 174         } | 211         } | 
| 175     } | 212     } | 
| 176 } | 213 } | 
| 177 | 214 | 
| 178 }  // namespace sk_sse41 | 215 }  // namespace sk_sse41 | 
|  | 216 | 
| 179 #endif | 217 #endif | 
| 180 | 218 | 
| 181 namespace SkOpts { | 219 namespace SkOpts { | 
| 182     void Init_sse41() { | 220     void Init_sse41() { | 
| 183         box_blur_xx = sk_sse41::box_blur_xx; | 221         box_blur_xx = sk_sse41::box_blur_xx; | 
| 184         box_blur_xy = sk_sse41::box_blur_xy; | 222         box_blur_xy = sk_sse41::box_blur_xy; | 
| 185         box_blur_yx = sk_sse41::box_blur_yx; | 223         box_blur_yx = sk_sse41::box_blur_yx; | 
| 186 | 224 | 
| 187     #ifndef SK_SUPPORT_LEGACY_X86_BLITS | 225     #ifndef SK_SUPPORT_LEGACY_X86_BLITS | 
| 188         blit_row_color32 = sk_sse41::blit_row_color32; | 226         blit_row_color32 = sk_sse41::blit_row_color32; | 
| 189         blit_mask_d32_a8 = sk_sse41::blit_mask_d32_a8; | 227         blit_mask_d32_a8 = sk_sse41::blit_mask_d32_a8; | 
| 190     #endif | 228     #endif | 
| 191     } | 229     } | 
| 192 } | 230 } | 
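
A few notes on the tricks this patch relies on, with standalone sketches that are not part of the patch itself.

replicate_coverage() (new lines 51-55) fans four packed coverage bytes out so each one occupies all four channel positions of its pixel: byte i of the pshufb result is byte i/4 of the input. A minimal check of that mask, using only the intrinsics and none of the Skia code (the unaligned load and memcpy here just keep the sketch self-contained):

```cpp
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    // Four coverage bytes in the low 32 bits, as next4(const uint8_t*&) loads them.
    uint8_t cov[4] = { 0x11, 0x22, 0x33, 0x44 };
    uint32_t bits;
    memcpy(&bits, cov, 4);
    __m128i xyzw = _mm_cvtsi32_si128((int)bits);

    // The same mask the patch uses: xyzw -> xxxx yyyy zzzz wwww.
    const uint8_t mask[16] = { 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3 };
    __m128i rep = _mm_shuffle_epi8(xyzw, _mm_loadu_si128((const __m128i*)mask));

    uint8_t out[16];
    _mm_storeu_si128((__m128i*)out, rep);
    for (int i = 0; i < 16; i++) {
        assert(out[i] == cov[i / 4]);   // each coverage byte replicated 4x
    }
    return 0;
}
```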
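Adapt<Fn> (new lines 117-136) is the seam between the two sides of the squiggly line: it widens each 8-bit lane to a 16-bit lane with a zero high byte, runs fn in that unpacked domain, then saturating-packs back down to bytes. The sketch below walks that round trip by hand on four packed pixels with a trivial stand-in for fn (scaling every channel by a constant); kCoverage and the pixel values are made up purely for illustration.

```cpp
#include <emmintrin.h>   // SSE2 is enough for this sketch
#include <cstdint>
#include <cstdio>

int main() {
    // Four packed 8888 pixels.
    uint32_t px[4] = { 0xFF102030, 0x80405060, 0x40708090, 0x00A0B0C0 };
    __m128i packed = _mm_loadu_si128((const __m128i*)px);

    // Unpack to 16-bit lanes with zero high bytes (the "unpacked" side).
    __m128i zero = _mm_setzero_si128(),
            lo   = _mm_unpacklo_epi8(packed, zero),   // pixels 0,1
            hi   = _mm_unpackhi_epi8(packed, zero);   // pixels 2,3

    // A stand-in for fn: scale every channel by a constant coverage,
    // using the same (x*c + 128)*257 >> 16 rounding as div255().
    const __m128i kCoverage = _mm_set1_epi16(0x80);   // hypothetical 50% coverage
    auto fn = [&](__m128i x) {
        __m128i prod = _mm_mullo_epi16(x, kCoverage);
        return _mm_mulhi_epu16(_mm_add_epi16(prod, _mm_set1_epi16(128)),
                               _mm_set1_epi16(257));
    };

    // Pack back down to bytes, exactly as Adapt::operator() does.
    __m128i result = _mm_packus_epi16(fn(lo), fn(hi));

    uint32_t out[4];
    _mm_storeu_si128((__m128i*)out, result);
    for (uint32_t v : out) printf("%08x\n", v);
    return 0;
}
```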
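div255() (new lines 141-146) rests on the identity quoted in its comment, (x+127)/255 == ((x+128)*257)>>16, which holds for every product two 8-bit values can form, so _mm_mulhi_epu16 never sees more than 16 bits. A brute-force scalar check of that claim:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    for (uint32_t x = 0; x <= 255u * 255u; x++) {
        uint32_t lhs = (x + 127) / 255;            // the comment's left-hand side
        uint32_t rhs = ((x + 128) * 257) >> 16;    // div255_part1 then div255_part2
        assert(lhs == rhs);
        assert(x + 128 <= 0xFFFF);                 // still a valid 16-bit lane for _mm_mulhi_epu16
    }
    return 0;
}
```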
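blit_row_color32 (new lines 165-179) goes one step beyond div255 by folding s into the dividend: s + ((d*A + 128)*257)>>16 becomes ((s*255 + 128 + d*A)*257)>>16, leaning on 255*257 = 65536 - 1 so the s*65536 term passes straight through the >>16. For a premultiplied source (each channel s <= alpha, with A = inv(alphas(s)) = 255 - alpha) the folded sum also stays inside a 16-bit lane. A scalar sanity check over every premultiplied channel, alpha, and destination value, again just a sketch outside the patch:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    for (uint32_t alpha = 0; alpha <= 255; alpha++)
    for (uint32_t s = 0; s <= alpha; s++)              // premultiplied channel value
    for (uint32_t d = 0; d <= 255; d++) {
        uint32_t A = 255 - alpha;                      // inv(alphas(s))
        uint32_t plain  = s + (((d * A + 128) * 257) >> 16);   // s + div255(d*A)
        uint32_t folded = s * 255 + 128 + d * A;       // what s_255_128 + d*A holds
        assert(folded <= 0xFFFF);                      // no 16-bit lane overflow
        assert(plain == ((folded * 257) >> 16));       // hoisted form matches
    }
    return 0;
}
```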