OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkBlitMask_opts_DEFINED | 8 #ifndef SkBlitMask_opts_DEFINED |
9 #define SkBlitMask_opts_DEFINED | 9 #define SkBlitMask_opts_DEFINED |
10 | 10 |
11 #include "Sk4px.h" | 11 #include "Sk4px.h" |
12 #include "SkPx.h" | |
13 | 12 |
14 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
15 | 14 |
16 template <typename Fn> | 15 #if defined(SK_ARM_HAS_NEON) |
17 static void blit_mask_d32_a8(const Fn& fn, SkPMColor* dst, size_t dstRB, | 16 // The Sk4px versions below will work fine with NEON, but we have had many indications |
18 const SkAlpha* mask, size_t maskRB, | 17 // that it doesn't perform as well as this NEON-specific code. TODO(mtklein): why? |
19 int w, int h) { | 18 #include "SkColor_opts_neon.h" |
20 while (h --> 0) { | 19 |
21 int n = w; | 20 template <bool isColor> |
22 while (n >= SkPx::N) { | 21 static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, |
23 fn(SkPx::Load(dst), SkPx::Alpha::Load(mask)).store(dst); | 22 const void* SK_RESTRICT maskPtr, size_t
maskRB, |
24 dst += SkPx::N; mask += SkPx::N; n -= SkPx::N; | 23 SkColor color, int width, int height) { |
| 24 SkPMColor pmc = SkPreMultiplyColor(color); |
| 25 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; |
| 26 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
| 27 uint8x8x4_t vpmc; |
| 28 |
| 29 maskRB -= width; |
| 30 dstRB -= (width << 2); |
| 31 |
| 32 if (width >= 8) { |
| 33 vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); |
| 34 vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); |
| 35 vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); |
| 36 vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); |
25 } | 37 } |
26 if (n > 0) { | 38 do { |
27 fn(SkPx::Load(dst, n), SkPx::Alpha::Load(mask, n)).store(dst, n); | 39 int w = width; |
28 dst += n; mask += n; | 40 while (w >= 8) { |
| 41 uint8x8_t vmask = vld1_u8(mask); |
| 42 uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); |
| 43 if (isColor) { |
| 44 vscale = vsubw_u8(vdupq_n_u16(256), |
| 45 SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); |
| 46 } else { |
| 47 vscale = vsubw_u8(vdupq_n_u16(256), vmask); |
| 48 } |
| 49 uint8x8x4_t vdev = vld4_u8((uint8_t*)device); |
| 50 |
| 51 vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256
) |
| 52 + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); |
| 53 vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256
) |
| 54 + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); |
| 55 vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256
) |
| 56 + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); |
| 57 vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256
) |
| 58 + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); |
| 59 |
| 60 vst4_u8((uint8_t*)device, vdev); |
| 61 |
| 62 mask += 8; |
| 63 device += 8; |
| 64 w -= 8; |
| 65 } |
| 66 |
| 67 while (w--) { |
| 68 unsigned aa = *mask++; |
| 69 if (isColor) { |
| 70 *device = SkBlendARGB32(pmc, *device, aa); |
| 71 } else { |
| 72 *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) |
| 73 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); |
| 74 } |
| 75 device += 1; |
| 76 }; |
| 77 |
| 78 device = (uint32_t*)((char*)device + dstRB); |
| 79 mask += maskRB; |
| 80 |
| 81 } while (--height != 0); |
| 82 } |
| 83 |
| 84 static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, |
| 85 const SkAlpha* mask, size_t maskRB, |
| 86 SkColor color, int w, int h) { |
| 87 D32_A8_Opaque_Color_neon<true>(dst, dstRB, mask, maskRB, color, w, h); |
| 88 } |
| 89 |
| 90 // As above, but made slightly simpler by requiring that color is opaque. |
| 91 static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, |
| 92 const SkAlpha* mask, size_t maskRB, |
| 93 SkColor color, int w, int h) { |
| 94 D32_A8_Opaque_Color_neon<false>(dst, dstRB, mask, maskRB, color, w, h); |
| 95 } |
| 96 |
| 97 // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case. |
| 98 static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, |
| 99 const SkAlpha* maskPtr, size_t maskRB, |
| 100 int width, int height) { |
| 101 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; |
| 102 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
| 103 |
| 104 maskRB -= width; |
| 105 dstRB -= (width << 2); |
| 106 do { |
| 107 int w = width; |
| 108 while (w >= 8) { |
| 109 uint8x8_t vmask = vld1_u8(mask); |
| 110 uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); |
| 111 uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); |
| 112 |
| 113 vdevice = SkAlphaMulQ_neon8(vdevice, vscale); |
| 114 vdevice.val[NEON_A] += vmask; |
| 115 |
| 116 vst4_u8((uint8_t*)device, vdevice); |
| 117 |
| 118 mask += 8; |
| 119 device += 8; |
| 120 w -= 8; |
| 121 } |
| 122 while (w-- > 0) { |
| 123 unsigned aa = *mask++; |
| 124 *device = (aa << SK_A32_SHIFT) |
| 125 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); |
| 126 device += 1; |
| 127 }; |
| 128 device = (uint32_t*)((char*)device + dstRB); |
| 129 mask += maskRB; |
| 130 } while (--height != 0); |
| 131 } |
| 132 |
| 133 #else |
| 134 static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, |
| 135 const SkAlpha* mask, size_t maskRB, |
| 136 SkColor color, int w, int h) { |
| 137 auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); |
| 138 auto fn = [&](const Sk4px& d, const Sk4px& aa) { |
| 139 // = (s + d(1-sa))aa + d(1-aa) |
| 140 // = s*aa + d(1-sa*aa) |
| 141 auto left = s.approxMulDiv255(aa), |
| 142 right = d.approxMulDiv255(left.alphas().inv()); |
| 143 return left + right; // This does not overflow (exhaustively checked). |
| 144 }; |
| 145 while (h --> 0) { |
| 146 Sk4px::MapDstAlpha(w, dst, mask, fn); |
| 147 dst += dstRB / sizeof(*dst); |
| 148 mask += maskRB / sizeof(*mask); |
29 } | 149 } |
30 dst += dstRB / sizeof(*dst) - w; | |
31 mask += maskRB / sizeof(*mask) - w; | |
32 } | 150 } |
33 } | |
34 | 151 |
35 static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, | 152 // As above, but made slightly simpler by requiring that color is opaque. |
36 const SkAlpha* mask, size_t maskRB, | 153 static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, |
37 SkColor color, int w, int h) { | 154 const SkAlpha* mask, size_t maskRB, |
38 auto s = SkPx::Dup(SkPreMultiplyColor(color)); | 155 SkColor color, int w, int h) { |
39 | 156 SkASSERT(SkColorGetA(color) == 0xFF); |
40 if (color == SK_ColorBLACK) { | 157 auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); |
41 auto fn = [](const SkPx& d, const SkPx::Alpha& aa) { | 158 auto fn = [&](const Sk4px& d, const Sk4px& aa) { |
42 // = (s + d(1-sa))aa + d(1-aa) | |
43 // = s*aa + d(1-sa*aa) | |
44 // ~~~> | |
45 // a = 1*aa + d(1-1*aa) = aa + d(1-aa) | |
46 // c = 0*aa + d(1-1*aa) = d(1-aa) | |
47 return d.approxMulDiv255(aa.inv()).addAlpha(aa); | |
48 }; | |
49 blit_mask_d32_a8(fn, dst, dstRB, mask, maskRB, w, h); | |
50 } else if (SkColorGetA(color) == 0xFF) { | |
51 auto fn = [&](const SkPx& d, const SkPx::Alpha& aa) { | |
52 // = (s + d(1-sa))aa + d(1-aa) | 159 // = (s + d(1-sa))aa + d(1-aa) |
53 // = s*aa + d(1-sa*aa) | 160 // = s*aa + d(1-sa*aa) |
54 // ~~~> | 161 // ~~~> |
55 // = s*aa + d(1-aa) | 162 // = s*aa + d(1-aa) |
56 return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv()); | 163 return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv()); |
57 }; | 164 }; |
58 blit_mask_d32_a8(fn, dst, dstRB, mask, maskRB, w, h); | 165 while (h --> 0) { |
| 166 Sk4px::MapDstAlpha(w, dst, mask, fn); |
| 167 dst += dstRB / sizeof(*dst); |
| 168 mask += maskRB / sizeof(*mask); |
| 169 } |
| 170 } |
| 171 |
| 172 // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case. |
| 173 static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, |
| 174 const SkAlpha* mask, size_t maskRB, |
| 175 int w, int h) { |
| 176 auto fn = [](const Sk4px& d, const Sk4px& aa) { |
| 177 // = (s + d(1-sa))aa + d(1-aa) |
| 178 // = s*aa + d(1-sa*aa) |
| 179 // ~~~> |
| 180 // a = 1*aa + d(1-1*aa) = aa + d(1-aa) |
| 181 // c = 0*aa + d(1-1*aa) = d(1-aa) |
| 182 return aa.zeroColors() + d.approxMulDiv255(aa.inv()); |
| 183 }; |
| 184 while (h --> 0) { |
| 185 Sk4px::MapDstAlpha(w, dst, mask, fn); |
| 186 dst += dstRB / sizeof(*dst); |
| 187 mask += maskRB / sizeof(*mask); |
| 188 } |
| 189 } |
| 190 #endif |
| 191 |
| 192 static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, |
| 193 const SkAlpha* mask, size_t maskRB, |
| 194 SkColor color, int w, int h) { |
| 195 if (color == SK_ColorBLACK) { |
| 196 blit_mask_d32_a8_black(dst, dstRB, mask, maskRB, w, h); |
| 197 } else if (SkColorGetA(color) == 0xFF) { |
| 198 blit_mask_d32_a8_opaque(dst, dstRB, mask, maskRB, color, w, h); |
59 } else { | 199 } else { |
60 auto fn = [&](const SkPx& d, const SkPx::Alpha& aa) { | 200 blit_mask_d32_a8_general(dst, dstRB, mask, maskRB, color, w, h); |
61 // = (s + d(1-sa))aa + d(1-aa) | |
62 // = s*aa + d(1-sa*aa) | |
63 auto left = s.approxMulDiv255(aa), | |
64 right = d.approxMulDiv255(left.alpha().inv()); | |
65 return left + right; // This does not overflow (exhaustively checked). | |
66 }; | |
67 blit_mask_d32_a8(fn, dst, dstRB, mask, maskRB, w, h); | |
68 } | 201 } |
69 } | 202 } |
70 | 203 |
71 } // SK_OPTS_NS | 204 } // SK_OPTS_NS |
72 | 205 |
73 #endif//SkBlitMask_opts_DEFINED | 206 #endif//SkBlitMask_opts_DEFINED |
OLD | NEW |