OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkBlitRow_opts_DEFINED | 8 #ifndef SkBlitRow_opts_DEFINED |
9 #define SkBlitRow_opts_DEFINED | 9 #define SkBlitRow_opts_DEFINED |
10 | 10 |
11 #include "Sk4px.h" | 11 #include "Sk4px.h" |
| 12 #include "SkColorPriv.h" |
| 13 #include "SkMSAN.h" |
| 14 |
| 15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 16 #include "SkColor_opts_SSE2.h" |
| 17 #endif |
12 | 18 |
13 namespace SK_OPTS_NS { | 19 namespace SK_OPTS_NS { |
14 | 20 |
15 // Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp. | 21 // Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp. |
16 // It's not quite perfect, but it's never wrong in the interesting edge cases, | 22 // It's not quite perfect, but it's never wrong in the interesting edge cases, |
17 // and it's quite a bit faster than blend_perfect. | 23 // and it's quite a bit faster than blend_perfect. |
18 // | 24 // |
19 // blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one. | 25 // blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one. |
20 static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) { | 26 static inline |
| 27 void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) { |
21 unsigned invA = 255 - SkGetPackedA32(color); | 28 unsigned invA = 255 - SkGetPackedA32(color); |
22 invA += invA >> 7; | 29 invA += invA >> 7; |
23 SkASSERT(invA < 256); // We should have already handled alpha == 0 externally. | 30 SkASSERT(invA < 256); // We should have already handled alpha == 0 externally. |
24 | 31 |
25 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); | 32 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); |
26 Sk16b invA_16x(invA); | 33 Sk16b invA_16x(invA); |
27 | 34 |
28 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { | 35 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { |
29 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); | 36 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); |
30 }); | 37 }); |
31 } | 38 } |
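Per channel, the Sk4px kernel above reduces to the scalar blend_256_round_alt form from tests/BlendTest.cpp. A minimal scalar sketch for reference (the standalone helper below is illustrative, not part of this patch):

    // out = (color*256 + src*invA + 128) >> 8, with invA = 255 - colorA
    // pre-biased by invA += invA >> 7, so invA would reach 256 when colorA == 0.
    // Premultiplication (color <= colorA per channel) keeps the result in [0,255].
    static inline uint8_t blend_color32_channel(uint8_t src, uint8_t color, unsigned invA) {
        return (color*256u + src*invA + 128) >> 8;
    }

When colorA == 255, invA is 0 and this collapses to color; when colorA == 0, invA would be 256 and it collapses to src, which is why that case is handled externally (note the SkASSERT above).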
32 | 39 |
| 40 static inline |
| 41 void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) { |
| 42 SkASSERT(alpha == 0xFF); |
| 43 sk_msan_assert_initialized(src, src+len); |
| 44 |
| 45 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 46 while (len >= 16) { |
| 47 // Load 16 source pixels. |
| 48 auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0), |
| 49 s1 = _mm_loadu_si128((const __m128i*)(src) + 1), |
| 50 s2 = _mm_loadu_si128((const __m128i*)(src) + 2), |
| 51 s3 = _mm_loadu_si128((const __m128i*)(src) + 3); |
| 52 |
| 53 const auto alphaMask = _mm_set1_epi32(0xFF000000); |
| 54 |
| 55 auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0))); |
| 56 if (_mm_testz_si128(ORed, alphaMask)) { |
| 57 // All 16 source pixels are transparent. Nothing to do. |
| 58 src += 16; |
| 59 dst += 16; |
| 60 len -= 16; |
| 61 continue; |
| 62 } |
| 63 |
| 64 auto d0 = (__m128i*)(dst) + 0, |
| 65 d1 = (__m128i*)(dst) + 1, |
| 66 d2 = (__m128i*)(dst) + 2, |
| 67 d3 = (__m128i*)(dst) + 3; |
| 68 |
| 69 auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0))); |
| 70 if (_mm_testc_si128(ANDed, alphaMask)) { |
| 71 // All 16 source pixels are opaque. SrcOver becomes Src. |
| 72 _mm_storeu_si128(d0, s0); |
| 73 _mm_storeu_si128(d1, s1); |
| 74 _mm_storeu_si128(d2, s2); |
| 75 _mm_storeu_si128(d3, s3); |
| 76 src += 16; |
| 77 dst += 16; |
| 78 len -= 16; |
| 79 continue; |
| 80 } |
| 81 |
| 82 // TODO: This math is wrong. |
| 83 // Do SrcOver. |
| 84 _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0))); |
| 85 _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1))); |
| 86 _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2))); |
| 87 _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3))); |
| 88 src += 16; |
| 89 dst += 16; |
| 90 len -= 16; |
| 91 } |
| 92 |
| 93 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 94 while (len >= 16) { |
| 95 // Load 16 source pixels. |
| 96 auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0), |
| 97 s1 = _mm_loadu_si128((const __m128i*)(src) + 1), |
| 98 s2 = _mm_loadu_si128((const __m128i*)(src) + 2), |
| 99 s3 = _mm_loadu_si128((const __m128i*)(src) + 3); |
| 100 |
| 101 const auto alphaMask = _mm_set1_epi32(0xFF000000); |
| 102 |
| 103 auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0))); |
| 104 if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), |
| 105 _mm_setzero_si128()))) { |
| 106 // All 16 source pixels are transparent. Nothing to do. |
| 107 src += 16; |
| 108 dst += 16; |
| 109 len -= 16; |
| 110 continue; |
| 111 } |
| 112 |
| 113 auto d0 = (__m128i*)(dst) + 0, |
| 114 d1 = (__m128i*)(dst) + 1, |
| 115 d2 = (__m128i*)(dst) + 2, |
| 116 d3 = (__m128i*)(dst) + 3; |
| 117 |
| 118 auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0))); |
| 119 if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), |
| 120 alphaMask))) { |
| 121 // All 16 source pixels are opaque. SrcOver becomes Src. |
| 122 _mm_storeu_si128(d0, s0); |
| 123 _mm_storeu_si128(d1, s1); |
| 124 _mm_storeu_si128(d2, s2); |
| 125 _mm_storeu_si128(d3, s3); |
| 126 src += 16; |
| 127 dst += 16; |
| 128 len -= 16; |
| 129 continue; |
| 130 } |
| 131 |
| 132 // TODO: This math is wrong. |
| 133 // Do SrcOver. |
| 134 _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0))); |
| 135 _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1))); |
| 136 _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2))); |
| 137 _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3))); |
| 138 |
| 139 src += 16; |
| 140 dst += 16; |
| 141 len -= 16; |
| 142 } |
| 143 |
| 144 #elif defined(SK_ARM_HAS_NEON) |
| 145 while (len >= 4) { |
| 146 if ((src[0] | src[1] | src[2] | src[3]) == 0x00000000) { |
| 147 // All 4 source pixels are transparent. Nothing to do. |
| 148 src += 4; |
| 149 dst += 4; |
| 150 len -= 4; |
| 151 continue; |
| 152 } |
| 153 |
| 154 if ((src[0] & src[1] & src[2] & src[3]) >= 0xFF000000) { |
| 155 // All 4 source pixels are opaque. SrcOver becomes Src. |
| 156 dst[0] = src[0]; |
| 157 dst[1] = src[1]; |
| 158 dst[2] = src[2]; |
| 159 dst[3] = src[3]; |
| 160 src += 4; |
| 161 dst += 4; |
| 162 len -= 4; |
| 163 continue; |
| 164 } |
| 165 |
| 166 // Load 4 source and destination pixels. |
| 167 auto src0 = vreinterpret_u8_u32(vld1_u32(src+0)), |
| 168 src2 = vreinterpret_u8_u32(vld1_u32(src+2)), |
| 169 dst0 = vreinterpret_u8_u32(vld1_u32(dst+0)), |
| 170 dst2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |
| 171 |
| 172 // TODO: This math is wrong. |
| 173 const uint8x8_t alphas = vcreate_u8(0x0707070703030303); |
| 174 auto invSA0_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src0, alphas)), |
| 175 invSA2_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src2, alphas)); |
| 176 |
| 177 auto dstInvSA0 = vmulq_u16(invSA0_w, vmovl_u8(dst0)), |
| 178 dstInvSA2 = vmulq_u16(invSA2_w, vmovl_u8(dst2)); |
| 179 |
| 180 dst0 = vadd_u8(src0, vshrn_n_u16(dstInvSA0, 8)); |
| 181 dst2 = vadd_u8(src2, vshrn_n_u16(dstInvSA2, 8)); |
| 182 |
| 183 vst1_u32(dst+0, vreinterpret_u32_u8(dst0)); |
| 184 vst1_u32(dst+2, vreinterpret_u32_u8(dst2)); |
| 185 |
| 186 src += 4; |
| 187 dst += 4; |
| 188 len -= 4; |
| 189 } |
| 190 #endif |
| 191 |
| 192 while (len-- > 0) { |
| 193 if (*src) { |
| 194 *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst); |
| 195 } |
| 196 src++; |
| 197 dst++; |
| 198 } |
| 199 } |
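Each vector path in blit_row_s32a_opaque follows the same grouped strategy: OR the source pixels together to detect the all-transparent case (skip the group), AND them to detect the all-opaque case (SrcOver degenerates to a plain copy), and only run the full blend on mixed groups. A scalar sketch of that strategy over groups of 4, using SkPMSrcOver from SkColorPriv.h (the loop shape is illustrative, not part of this patch):

    while (len >= 4) {
        uint32_t ORed  = src[0] | src[1] | src[2] | src[3],
                 ANDed = src[0] & src[1] & src[2] & src[3];
        if ((ORed >> 24) == 0x00) {
            // All 4 source pixels are transparent; dst is unchanged.
        } else if ((ANDed >> 24) == 0xFF) {
            // All 4 source pixels are opaque; copy them straight through.
            dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
        } else {
            // Mixed coverage; do the full SrcOver blend.
            for (int i = 0; i < 4; i++) { dst[i] = SkPMSrcOver(src[i], dst[i]); }
        }
        src += 4; dst += 4; len -= 4;
    }

The per-pixel tail loop above applies the same two checks before falling back to SkPMSrcOver.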
| 200 |
33 } // SK_OPTS_NS | 201 } // SK_OPTS_NS |
34 | 202 |
35 #endif//SkBlitRow_opts_DEFINED | 203 #endif//SkBlitRow_opts_DEFINED |