OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkBlitMask_opts_DEFINED | 8 #ifndef SkBlitMask_opts_DEFINED |
9 #define SkBlitMask_opts_DEFINED | 9 #define SkBlitMask_opts_DEFINED |
10 | 10 |
11 #include "Sk4px.h" | 11 #include "Sk4px.h" |
| 12 #include "SkPx.h" |
12 | 13 |
13 namespace SK_OPTS_NS { | 14 namespace SK_OPTS_NS { |
14 | 15 |
15 #if defined(SK_ARM_HAS_NEON) | 16 template <typename Fn> |
16 // The Sk4px versions below will work fine with NEON, but we have had many i
ndications | 17 static void blit_mask_d32_a8(const Fn& fn, SkPMColor* dst, size_t dstRB, |
17 // that it doesn't perform as well as this NEON-specific code. TODO(mtklein
): why? | 18 const SkAlpha* mask, size_t maskRB, |
18 #include "SkColor_opts_neon.h" | 19 int w, int h) { |
| 20 while (h --> 0) { |
| 21 int n = w; |
| 22 while (n >= SkPx::N) { |
| 23 fn(SkPx::Load(dst), SkPx::Alpha::Load(mask)).store(dst); |
| 24 dst += SkPx::N; mask += SkPx::N; n -= SkPx::N; |
| 25 } |
| 26 if (n > 0) { |
| 27 fn(SkPx::Load(dst, n), SkPx::Alpha::Load(mask, n)).store(dst, n); |
| 28 dst += n; mask += n; |
| 29 } |
| 30 dst += dstRB / sizeof(*dst) - w; |
| 31 mask += maskRB / sizeof(*mask) - w; |
| 32 } |
| 33 } |
19 | 34 |
20 template <bool isColor> | 35 static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, |
21 static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, | 36 const SkAlpha* mask, size_t maskRB, |
22 const void* SK_RESTRICT maskPtr, size_t
maskRB, | 37 SkColor color, int w, int h) { |
23 SkColor color, int width, int height) { | 38 auto s = SkPx::Dup(SkPreMultiplyColor(color)); |
24 SkPMColor pmc = SkPreMultiplyColor(color); | |
25 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; | |
26 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; | |
27 uint8x8x4_t vpmc; | |
28 | 39 |
29 maskRB -= width; | 40 if (color == SK_ColorBLACK) { |
30 dstRB -= (width << 2); | 41 auto fn = [](const SkPx& d, const SkPx::Alpha& aa) { |
31 | 42 // = (s + d(1-sa))aa + d(1-aa) |
32 if (width >= 8) { | 43 // = s*aa + d(1-sa*aa) |
33 vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); | 44 // ~~~> |
34 vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); | 45 // a = 1*aa + d(1-1*aa) = aa + d(1-aa) |
35 vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); | 46 // c = 0*aa + d(1-1*aa) = d(1-aa) |
36 vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); | 47 return d.approxMulDiv255(aa.inv()).addAlpha(aa); |
37 } | |
38 do { | |
39 int w = width; | |
40 while (w >= 8) { | |
41 uint8x8_t vmask = vld1_u8(mask); | |
42 uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); | |
43 if (isColor) { | |
44 vscale = vsubw_u8(vdupq_n_u16(256), | |
45 SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); | |
46 } else { | |
47 vscale = vsubw_u8(vdupq_n_u16(256), vmask); | |
48 } | |
49 uint8x8x4_t vdev = vld4_u8((uint8_t*)device); | |
50 | |
51 vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256
) | |
52 + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); | |
53 vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256
) | |
54 + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); | |
55 vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256
) | |
56 + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); | |
57 vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256
) | |
58 + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); | |
59 | |
60 vst4_u8((uint8_t*)device, vdev); | |
61 | |
62 mask += 8; | |
63 device += 8; | |
64 w -= 8; | |
65 } | |
66 | |
67 while (w--) { | |
68 unsigned aa = *mask++; | |
69 if (isColor) { | |
70 *device = SkBlendARGB32(pmc, *device, aa); | |
71 } else { | |
72 *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) | |
73 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); | |
74 } | |
75 device += 1; | |
76 }; | |
77 | |
78 device = (uint32_t*)((char*)device + dstRB); | |
79 mask += maskRB; | |
80 | |
81 } while (--height != 0); | |
82 } | |
83 | |
84 static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, | |
85 const SkAlpha* mask, size_t maskRB, | |
86 SkColor color, int w, int h) { | |
87 D32_A8_Opaque_Color_neon<true>(dst, dstRB, mask, maskRB, color, w, h); | |
88 } | |
89 | |
90 // As above, but made slightly simpler by requiring that color is opaque. | |
91 static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, | |
92 const SkAlpha* mask, size_t maskRB, | |
93 SkColor color, int w, int h) { | |
94 D32_A8_Opaque_Color_neon<false>(dst, dstRB, mask, maskRB, color, w, h); | |
95 } | |
96 | |
97 // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and ev
en simpler case. | |
98 static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, | |
99 const SkAlpha* maskPtr, size_t maskRB, | |
100 int width, int height) { | |
101 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; | |
102 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; | |
103 | |
104 maskRB -= width; | |
105 dstRB -= (width << 2); | |
106 do { | |
107 int w = width; | |
108 while (w >= 8) { | |
109 uint8x8_t vmask = vld1_u8(mask); | |
110 uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); | |
111 uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); | |
112 | |
113 vdevice = SkAlphaMulQ_neon8(vdevice, vscale); | |
114 vdevice.val[NEON_A] += vmask; | |
115 | |
116 vst4_u8((uint8_t*)device, vdevice); | |
117 | |
118 mask += 8; | |
119 device += 8; | |
120 w -= 8; | |
121 } | |
122 while (w-- > 0) { | |
123 unsigned aa = *mask++; | |
124 *device = (aa << SK_A32_SHIFT) | |
125 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); | |
126 device += 1; | |
127 }; | |
128 device = (uint32_t*)((char*)device + dstRB); | |
129 mask += maskRB; | |
130 } while (--height != 0); | |
131 } | |
132 | |
133 #else | |
134 static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, | |
135 const SkAlpha* mask, size_t maskRB, | |
136 SkColor color, int w, int h) { | |
137 auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); | |
138 auto fn = [&](const Sk4px& d, const Sk4px& aa) { | |
139 // = (s + d(1-sa))aa + d(1-aa) | |
140 // = s*aa + d(1-sa*aa) | |
141 auto left = s.approxMulDiv255(aa), | |
142 right = d.approxMulDiv255(left.alphas().inv()); | |
143 return left + right; // This does not overflow (exhaustively checke
d). | |
144 }; | 48 }; |
145 while (h --> 0) { | 49 blit_mask_d32_a8(fn, dst, dstRB, mask, maskRB, w, h); |
146 Sk4px::MapDstAlpha(w, dst, mask, fn); | 50 } else if (SkColorGetA(color) == 0xFF) { |
147 dst += dstRB / sizeof(*dst); | 51 auto fn = [&](const SkPx& d, const SkPx::Alpha& aa) { |
148 mask += maskRB / sizeof(*mask); | |
149 } | |
150 } | |
151 | |
152 // As above, but made slightly simpler by requiring that color is opaque. | |
153 static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, | |
154 const SkAlpha* mask, size_t maskRB, | |
155 SkColor color, int w, int h) { | |
156 SkASSERT(SkColorGetA(color) == 0xFF); | |
157 auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); | |
158 auto fn = [&](const Sk4px& d, const Sk4px& aa) { | |
159 // = (s + d(1-sa))aa + d(1-aa) | 52 // = (s + d(1-sa))aa + d(1-aa) |
160 // = s*aa + d(1-sa*aa) | 53 // = s*aa + d(1-sa*aa) |
161 // ~~~> | 54 // ~~~> |
162 // = s*aa + d(1-aa) | 55 // = s*aa + d(1-aa) |
163 return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv()); | 56 return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv()); |
164 }; | 57 }; |
165 while (h --> 0) { | 58 blit_mask_d32_a8(fn, dst, dstRB, mask, maskRB, w, h); |
166 Sk4px::MapDstAlpha(w, dst, mask, fn); | 59 } else { |
167 dst += dstRB / sizeof(*dst); | 60 auto fn = [&](const SkPx& d, const SkPx::Alpha& aa) { |
168 mask += maskRB / sizeof(*mask); | 61 // = (s + d(1-sa))aa + d(1-aa) |
169 } | 62 // = s*aa + d(1-sa*aa) |
170 } | 63 auto left = s.approxMulDiv255(aa), |
171 | 64 right = d.approxMulDiv255(left.alpha().inv()); |
172 // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and ev
en simpler case. | 65 return left + right; // This does not overflow (exhaustively checke
d). |
173 static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, | |
174 const SkAlpha* mask, size_t maskRB, | |
175 int w, int h) { | |
176 auto fn = [](const Sk4px& d, const Sk4px& aa) { | |
177 // = (s + d(1-sa))aa + d(1-aa) | |
178 // = s*aa + d(1-sa*aa) | |
179 // ~~~> | |
180 // a = 1*aa + d(1-1*aa) = aa + d(1-aa) | |
181 // c = 0*aa + d(1-1*aa) = d(1-aa) | |
182 return aa.zeroColors() + d.approxMulDiv255(aa.inv()); | |
183 }; | 66 }; |
184 while (h --> 0) { | 67 blit_mask_d32_a8(fn, dst, dstRB, mask, maskRB, w, h); |
185 Sk4px::MapDstAlpha(w, dst, mask, fn); | |
186 dst += dstRB / sizeof(*dst); | |
187 mask += maskRB / sizeof(*mask); | |
188 } | |
189 } | |
190 #endif | |
191 | |
192 static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, | |
193 const SkAlpha* mask, size_t maskRB, | |
194 SkColor color, int w, int h) { | |
195 if (color == SK_ColorBLACK) { | |
196 blit_mask_d32_a8_black(dst, dstRB, mask, maskRB, w, h); | |
197 } else if (SkColorGetA(color) == 0xFF) { | |
198 blit_mask_d32_a8_opaque(dst, dstRB, mask, maskRB, color, w, h); | |
199 } else { | |
200 blit_mask_d32_a8_general(dst, dstRB, mask, maskRB, color, w, h); | |
201 } | 68 } |
202 } | 69 } |
203 | 70 |
204 } // SK_OPTS_NS | 71 } // SK_OPTS_NS |
205 | 72 |
206 #endif//SkBlitMask_opts_DEFINED | 73 #endif//SkBlitMask_opts_DEFINED |
OLD | NEW |