OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkXfermode.h" | 8 // Including Sk4pxXfermode.h from this file should find SK_ARM_HAS_NEON is defin
ed. |
9 #include "SkXfermode_proccoeff.h" | |
10 #include "SkColorPriv.h" | |
11 | |
12 #include <arm_neon.h> | |
13 #include "SkColor_opts_neon.h" | |
14 #include "SkXfermode_opts_arm_neon.h" | |
15 #include "Sk4pxXfermode.h" | 9 #include "Sk4pxXfermode.h" |
16 | 10 |
17 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 11 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& r, SkX
fermode::Mode m); |
18 | 12 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& r, SkX
fermode::Mode m) { |
19 | 13 return SkCreate4pxXfermode(r, m); |
20 //////////////////////////////////////////////////////////////////////////////// | |
21 // NEONized skia functions | |
22 //////////////////////////////////////////////////////////////////////////////// | |
23 | |
24 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha)
{ | |
25 uint16x8_t tmp; | |
26 uint8x8_t ret; | |
27 | |
28 tmp = vmull_u8(color, alpha); | |
29 tmp = vaddq_u16(tmp, vdupq_n_u16(128)); | |
30 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); | |
31 | |
32 ret = vshrn_n_u16(tmp, 8); | |
33 | |
34 return ret; | |
35 } | 14 } |
36 | |
37 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp
ha) { | |
38 uint16x8_t ret; | |
39 | |
40 ret = vmull_u8(color, alpha); | |
41 ret = vaddq_u16(ret, vdupq_n_u16(128)); | |
42 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | |
43 | |
44 ret = vshrq_n_u16(ret, 8); | |
45 | |
46 return ret; | |
47 } | |
48 | |
49 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | |
50 uint16x8_t tmp; | |
51 | |
52 #ifdef SK_CPU_ARM64 | |
53 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
54 vreinterpretq_u32_s32(p2)); | |
55 #else | |
56 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
57 vmovn_u32(vreinterpretq_u32_s32(p2))); | |
58 #endif | |
59 | |
60 tmp += vdupq_n_u16(128); | |
61 tmp += vshrq_n_u16(tmp, 8); | |
62 | |
63 return vshrn_n_u16(tmp, 8); | |
64 } | |
65 | |
66 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | |
67 prod += vdupq_n_u16(128); | |
68 prod += vshrq_n_u16(prod, 8); | |
69 | |
70 return vshrq_n_u16(prod, 8); | |
71 } | |
72 | |
73 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
2) { | |
74 uint8x8_t ret; | |
75 uint32x4_t cmp1, cmp2; | |
76 uint16x8_t cmp16; | |
77 uint8x8_t cmp8, cmp8_1; | |
78 | |
79 // Test if <= 0 | |
80 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | |
81 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | |
82 #ifdef SK_CPU_ARM64 | |
83 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
84 #else | |
85 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | |
86 #endif | |
87 cmp8_1 = vmovn_u16(cmp16); | |
88 | |
89 // Init to zero | |
90 ret = vdup_n_u8(0); | |
91 | |
92 // Test if >= 255*255 | |
93 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | |
94 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | |
95 #ifdef SK_CPU_ARM64 | |
96 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
97 #else | |
98 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | |
99 #endif | |
100 cmp8 = vmovn_u16(cmp16); | |
101 | |
102 // Insert 255 where true | |
103 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | |
104 | |
105 // Calc SkDiv255Round | |
106 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | |
107 | |
108 // Insert where false and previous test false | |
109 cmp8 = cmp8 | cmp8_1; | |
110 ret = vbsl_u8(cmp8, ret, div); | |
111 | |
112 // Return the final combination | |
113 return ret; | |
114 } | |
115 | |
116 //////////////////////////////////////////////////////////////////////////////// | |
117 // 1 pixel modeprocs | |
118 //////////////////////////////////////////////////////////////////////////////// | |
119 | |
120 // kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc] | |
121 SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
122 unsigned sa = SkGetPackedA32(src); | |
123 unsigned da = SkGetPackedA32(dst); | |
124 unsigned isa = 255 - sa; | |
125 | |
126 uint8x8_t vda, visa, vsrc, vdst; | |
127 | |
128 vda = vdup_n_u8(da); | |
129 visa = vdup_n_u8(isa); | |
130 | |
131 uint16x8_t vsrc_wide, vdst_wide; | |
132 vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src))); | |
133 vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst))); | |
134 | |
135 vsrc_wide += vdupq_n_u16(128); | |
136 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); | |
137 | |
138 vdst_wide += vdupq_n_u16(128); | |
139 vdst_wide += vshrq_n_u16(vdst_wide, 8); | |
140 | |
141 vsrc = vshrn_n_u16(vsrc_wide, 8); | |
142 vdst = vshrn_n_u16(vdst_wide, 8); | |
143 | |
144 vsrc += vdst; | |
145 vsrc = vset_lane_u8(da, vsrc, 3); | |
146 | |
147 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
148 } | |
149 | |
150 // kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)] | |
151 SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
152 unsigned sa = SkGetPackedA32(src); | |
153 unsigned da = SkGetPackedA32(dst); | |
154 unsigned ida = 255 - da; | |
155 | |
156 uint8x8_t vsa, vida, vsrc, vdst; | |
157 | |
158 vsa = vdup_n_u8(sa); | |
159 vida = vdup_n_u8(ida); | |
160 | |
161 uint16x8_t vsrc_wide, vdst_wide; | |
162 vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src))); | |
163 vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst))); | |
164 | |
165 vsrc_wide += vdupq_n_u16(128); | |
166 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); | |
167 | |
168 vdst_wide += vdupq_n_u16(128); | |
169 vdst_wide += vshrq_n_u16(vdst_wide, 8); | |
170 | |
171 vsrc = vshrn_n_u16(vsrc_wide, 8); | |
172 vdst = vshrn_n_u16(vdst_wide, 8); | |
173 | |
174 vsrc += vdst; | |
175 vsrc = vset_lane_u8(sa, vsrc, 3); | |
176 | |
177 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
178 } | |
179 | |
180 // kXor_Mode [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc] | |
181 SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
182 unsigned sa = SkGetPackedA32(src); | |
183 unsigned da = SkGetPackedA32(dst); | |
184 unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1); | |
185 unsigned isa = 255 - sa; | |
186 unsigned ida = 255 - da; | |
187 | |
188 uint8x8_t vsrc, vdst, visa, vida; | |
189 uint16x8_t vsrc_wide, vdst_wide; | |
190 | |
191 visa = vdup_n_u8(isa); | |
192 vida = vdup_n_u8(ida); | |
193 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); | |
194 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); | |
195 | |
196 vsrc_wide = vmull_u8(vsrc, vida); | |
197 vdst_wide = vmull_u8(vdst, visa); | |
198 | |
199 vsrc_wide += vdupq_n_u16(128); | |
200 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); | |
201 | |
202 vdst_wide += vdupq_n_u16(128); | |
203 vdst_wide += vshrq_n_u16(vdst_wide, 8); | |
204 | |
205 vsrc = vshrn_n_u16(vsrc_wide, 8); | |
206 vdst = vshrn_n_u16(vdst_wide, 8); | |
207 | |
208 vsrc += vdst; | |
209 | |
210 vsrc = vset_lane_u8(ret_alpha, vsrc, 3); | |
211 | |
212 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
213 } | |
214 | |
215 // kPlus_Mode | |
216 SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
217 uint8x8_t vsrc, vdst; | |
218 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); | |
219 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); | |
220 vsrc = vqadd_u8(vsrc, vdst); | |
221 | |
222 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
223 } | |
224 | |
225 // kModulate_Mode | |
226 SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
227 uint8x8_t vsrc, vdst, vres; | |
228 uint16x8_t vres_wide; | |
229 | |
230 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); | |
231 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); | |
232 | |
233 vres_wide = vmull_u8(vsrc, vdst); | |
234 | |
235 vres_wide += vdupq_n_u16(128); | |
236 vres_wide += vshrq_n_u16(vres_wide, 8); | |
237 | |
238 vres = vshrn_n_u16(vres_wide, 8); | |
239 | |
240 return vget_lane_u32(vreinterpret_u32_u8(vres), 0); | |
241 } | |
242 | |
243 //////////////////////////////////////////////////////////////////////////////// | |
244 // 8 pixels modeprocs | |
245 //////////////////////////////////////////////////////////////////////////////// | |
246 | |
247 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
248 uint8x8x4_t ret; | |
249 uint16x8_t src_scale; | |
250 | |
251 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); | |
252 | |
253 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_sc
ale); | |
254 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_sc
ale); | |
255 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_sc
ale); | |
256 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_sc
ale); | |
257 | |
258 return ret; | |
259 } | |
260 | |
261 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
262 uint8x8x4_t ret; | |
263 uint16x8_t scale; | |
264 | |
265 scale = SkAlpha255To256_neon8(dst.val[NEON_A]); | |
266 | |
267 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); | |
268 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); | |
269 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); | |
270 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); | |
271 | |
272 return ret; | |
273 } | |
274 | |
275 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
276 uint8x8x4_t ret; | |
277 uint16x8_t scale; | |
278 | |
279 scale = SkAlpha255To256_neon8(src.val[NEON_A]); | |
280 | |
281 ret = SkAlphaMulQ_neon8(dst, scale); | |
282 | |
283 return ret; | |
284 } | |
285 | |
286 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
287 uint8x8x4_t ret; | |
288 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); | |
289 | |
290 ret = SkAlphaMulQ_neon8(src, scale); | |
291 | |
292 return ret; | |
293 } | |
294 | |
295 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
296 uint8x8x4_t ret; | |
297 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); | |
298 | |
299 ret = SkAlphaMulQ_neon8(dst, scale); | |
300 | |
301 return ret; | |
302 } | |
303 | |
304 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
305 uint8x8x4_t ret; | |
306 uint8x8_t isa; | |
307 | |
308 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); | |
309 | |
310 ret.val[NEON_A] = dst.val[NEON_A]; | |
311 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) | |
312 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); | |
313 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) | |
314 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); | |
315 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) | |
316 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); | |
317 | |
318 return ret; | |
319 } | |
320 | |
321 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
322 uint8x8x4_t ret; | |
323 uint8x8_t ida; | |
324 | |
325 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); | |
326 | |
327 ret.val[NEON_A] = src.val[NEON_A]; | |
328 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) | |
329 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); | |
330 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) | |
331 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); | |
332 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) | |
333 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); | |
334 | |
335 return ret; | |
336 } | |
337 | |
338 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
339 uint8x8x4_t ret; | |
340 uint8x8_t isa, ida; | |
341 uint16x8_t tmp_wide, tmp_wide2; | |
342 | |
343 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); | |
344 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); | |
345 | |
346 // First calc alpha | |
347 tmp_wide = vmovl_u8(src.val[NEON_A]); | |
348 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); | |
349 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A
]), 1); | |
350 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); | |
351 ret.val[NEON_A] = vmovn_u16(tmp_wide); | |
352 | |
353 // Then colors | |
354 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) | |
355 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); | |
356 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) | |
357 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); | |
358 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) | |
359 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); | |
360 | |
361 return ret; | |
362 } | |
363 | |
364 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
365 uint8x8x4_t ret; | |
366 | |
367 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); | |
368 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); | |
369 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); | |
370 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); | |
371 | |
372 return ret; | |
373 } | |
374 | |
375 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
376 uint8x8x4_t ret; | |
377 | |
378 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); | |
379 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); | |
380 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); | |
381 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); | |
382 | |
383 return ret; | |
384 } | |
385 | |
386 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { | |
387 uint16x8_t tmp; | |
388 | |
389 tmp = vaddl_u8(a, b); | |
390 tmp -= SkAlphaMulAlpha_neon8_16(a, b); | |
391 | |
392 return vmovn_u16(tmp); | |
393 } | |
394 | |
395 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
396 uint8x8x4_t ret; | |
397 | |
398 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
399 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); | |
400 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); | |
401 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); | |
402 | |
403 return ret; | |
404 } | |
405 | |
406 template <bool overlay> | |
407 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, | |
408 uint8x8_t sa, uint8x8_t da) { | |
409 /* | |
410 * In the end we're gonna use (rc + tmp) with a different rc | |
411 * coming from an alternative. | |
412 * The whole value (rc + tmp) can always be expressed as | |
413 * VAL = COM - SUB in the if case | |
414 * VAL = COM + SUB - sa*da in the else case | |
415 * | |
416 * with COM = 255 * (sc + dc) | |
417 * and SUB = sc*da + dc*sa - 2*dc*sc | |
418 */ | |
419 | |
420 // Prepare common subexpressions | |
421 uint16x8_t const255 = vdupq_n_u16(255); | |
422 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); | |
423 uint16x8_t scda = vmull_u8(sc, da); | |
424 uint16x8_t dcsa = vmull_u8(dc, sa); | |
425 uint16x8_t sada = vmull_u8(sa, da); | |
426 | |
427 // Prepare non common subexpressions | |
428 uint16x8_t dc2, sc2; | |
429 uint32x4_t scdc2_1, scdc2_2; | |
430 if (overlay) { | |
431 dc2 = vshll_n_u8(dc, 1); | |
432 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | |
433 #ifdef SK_CPU_ARM64 | |
434 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); | |
435 #else | |
436 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | |
437 #endif | |
438 } else { | |
439 sc2 = vshll_n_u8(sc, 1); | |
440 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | |
441 #ifdef SK_CPU_ARM64 | |
442 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); | |
443 #else | |
444 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | |
445 #endif | |
446 } | |
447 | |
448 // Calc COM | |
449 int32x4_t com1, com2; | |
450 com1 = vreinterpretq_s32_u32( | |
451 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | |
452 com2 = vreinterpretq_s32_u32( | |
453 #ifdef SK_CPU_ARM64 | |
454 vmull_high_u16(const255, sc_plus_dc)); | |
455 #else | |
456 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | |
457 #endif | |
458 | |
459 // Calc SUB | |
460 int32x4_t sub1, sub2; | |
461 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa
))); | |
462 #ifdef SK_CPU_ARM64 | |
463 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); | |
464 #else | |
465 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc
sa))); | |
466 #endif | |
467 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | |
468 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | |
469 | |
470 // Compare 2*dc <= da | |
471 uint16x8_t cmp; | |
472 | |
473 if (overlay) { | |
474 cmp = vcleq_u16(dc2, vmovl_u8(da)); | |
475 } else { | |
476 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | |
477 } | |
478 | |
479 // Prepare variables | |
480 int32x4_t val1_1, val1_2; | |
481 int32x4_t val2_1, val2_2; | |
482 uint32x4_t cmp1, cmp2; | |
483 | |
484 // Doing a signed lengthening allows to save a few instructions | |
485 // thanks to sign extension. | |
486 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp
)))); | |
487 #ifdef SK_CPU_ARM64 | |
488 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); | |
489 #else | |
490 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cm
p)))); | |
491 #endif | |
492 | |
493 // Calc COM - SUB | |
494 val1_1 = com1 - sub1; | |
495 val1_2 = com2 - sub2; | |
496 | |
497 // Calc COM + SUB - sa*da | |
498 val2_1 = com1 + sub1; | |
499 val2_2 = com2 + sub2; | |
500 | |
501 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada
)))); | |
502 #ifdef SK_CPU_ARM64 | |
503 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); | |
504 #else | |
505 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad
a)))); | |
506 #endif | |
507 | |
508 // Insert where needed | |
509 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | |
510 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | |
511 | |
512 // Call the clamp_div255round function | |
513 return clamp_div255round_simd8_32(val1_1, val1_2); | |
514 } | |
515 | |
516 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | |
517 uint8x8_t sa, uint8x8_t da) { | |
518 return overlay_hardlight_color<true>(sc, dc, sa, da); | |
519 } | |
520 | |
521 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
522 uint8x8x4_t ret; | |
523 | |
524 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
525 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], | |
526 src.val[NEON_A], dst.val[NEON_A]); | |
527 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], | |
528 src.val[NEON_A], dst.val[NEON_A]); | |
529 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], | |
530 src.val[NEON_A], dst.val[NEON_A]); | |
531 | |
532 return ret; | |
533 } | |
534 | |
535 template <bool lighten> | |
536 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, | |
537 uint8x8_t sa, uint8x8_t da) { | |
538 uint16x8_t sd, ds, cmp, tmp, tmp2; | |
539 | |
540 // Prepare | |
541 sd = vmull_u8(sc, da); | |
542 ds = vmull_u8(dc, sa); | |
543 | |
544 // Do test | |
545 if (lighten) { | |
546 cmp = vcgtq_u16(sd, ds); | |
547 } else { | |
548 cmp = vcltq_u16(sd, ds); | |
549 } | |
550 | |
551 // Assign if | |
552 tmp = vaddl_u8(sc, dc); | |
553 tmp2 = tmp; | |
554 tmp -= SkDiv255Round_neon8_16_16(ds); | |
555 | |
556 // Calc else | |
557 tmp2 -= SkDiv255Round_neon8_16_16(sd); | |
558 | |
559 // Insert where needed | |
560 tmp = vbslq_u16(cmp, tmp, tmp2); | |
561 | |
562 return vmovn_u16(tmp); | |
563 } | |
564 | |
565 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, | |
566 uint8x8_t sa, uint8x8_t da) { | |
567 return lighten_darken_color<false>(sc, dc, sa, da); | |
568 } | |
569 | |
570 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
571 uint8x8x4_t ret; | |
572 | |
573 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
574 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], | |
575 src.val[NEON_A], dst.val[NEON_A]); | |
576 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], | |
577 src.val[NEON_A], dst.val[NEON_A]); | |
578 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], | |
579 src.val[NEON_A], dst.val[NEON_A]); | |
580 | |
581 return ret; | |
582 } | |
583 | |
584 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, | |
585 uint8x8_t sa, uint8x8_t da) { | |
586 return lighten_darken_color<true>(sc, dc, sa, da); | |
587 } | |
588 | |
589 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
590 uint8x8x4_t ret; | |
591 | |
592 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
593 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], | |
594 src.val[NEON_A], dst.val[NEON_A]); | |
595 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], | |
596 src.val[NEON_A], dst.val[NEON_A]); | |
597 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], | |
598 src.val[NEON_A], dst.val[NEON_A]); | |
599 | |
600 return ret; | |
601 } | |
602 | |
603 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, | |
604 uint8x8_t sa, uint8x8_t da) { | |
605 return overlay_hardlight_color<false>(sc, dc, sa, da); | |
606 } | |
607 | |
608 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
609 uint8x8x4_t ret; | |
610 | |
611 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
612 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], | |
613 src.val[NEON_A], dst.val[NEON_A]); | |
614 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], | |
615 src.val[NEON_A], dst.val[NEON_A]); | |
616 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], | |
617 src.val[NEON_A], dst.val[NEON_A]); | |
618 | |
619 return ret; | |
620 } | |
621 | |
622 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, | |
623 uint8x8_t sa, uint8x8_t da) { | |
624 uint16x8_t sd, ds, tmp; | |
625 int16x8_t val; | |
626 | |
627 sd = vmull_u8(sc, da); | |
628 ds = vmull_u8(dc, sa); | |
629 | |
630 tmp = vminq_u16(sd, ds); | |
631 tmp = SkDiv255Round_neon8_16_16(tmp); | |
632 tmp = vshlq_n_u16(tmp, 1); | |
633 | |
634 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); | |
635 | |
636 val -= vreinterpretq_s16_u16(tmp); | |
637 | |
638 val = vmaxq_s16(val, vdupq_n_s16(0)); | |
639 val = vminq_s16(val, vdupq_n_s16(255)); | |
640 | |
641 return vmovn_u16(vreinterpretq_u16_s16(val)); | |
642 } | |
643 | |
644 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
645 uint8x8x4_t ret; | |
646 | |
647 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
648 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], | |
649 src.val[NEON_A], dst.val[NEON_A]); | |
650 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], | |
651 src.val[NEON_A], dst.val[NEON_A]); | |
652 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], | |
653 src.val[NEON_A], dst.val[NEON_A]); | |
654 | |
655 return ret; | |
656 } | |
657 | |
658 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, | |
659 uint8x8_t sa, uint8x8_t da) { | |
660 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ | |
661 | |
662 uint16x8_t sc_plus_dc, scdc, const255; | |
663 int32x4_t term1_1, term1_2, term2_1, term2_2; | |
664 | |
665 /* Calc (sc + dc) and (sc * dc) */ | |
666 sc_plus_dc = vaddl_u8(sc, dc); | |
667 scdc = vmull_u8(sc, dc); | |
668 | |
669 /* Prepare constants */ | |
670 const255 = vdupq_n_u16(255); | |
671 | |
672 /* Calc the first term */ | |
673 term1_1 = vreinterpretq_s32_u32( | |
674 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | |
675 term1_2 = vreinterpretq_s32_u32( | |
676 #ifdef SK_CPU_ARM64 | |
677 vmull_high_u16(const255, sc_plus_dc)); | |
678 #else | |
679 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | |
680 #endif | |
681 | |
682 /* Calc the second term */ | |
683 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | |
684 #ifdef SK_CPU_ARM64 | |
685 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); | |
686 #else | |
687 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | |
688 #endif | |
689 | |
690 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | |
691 } | |
692 | |
693 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
694 uint8x8x4_t ret; | |
695 | |
696 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
697 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | |
698 src.val[NEON_A], dst.val[NEON_A]); | |
699 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | |
700 src.val[NEON_A], dst.val[NEON_A]); | |
701 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | |
702 src.val[NEON_A], dst.val[NEON_A]); | |
703 | |
704 return ret; | |
705 } | |
706 | |
707 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | |
708 uint8x8_t sa, uint8x8_t da) { | |
709 uint32x4_t val1, val2; | |
710 uint16x8_t scdc, t1, t2; | |
711 | |
712 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | |
713 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | |
714 scdc = vmull_u8(sc, dc); | |
715 | |
716 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | |
717 #ifdef SK_CPU_ARM64 | |
718 val2 = vaddl_high_u16(t1, t2); | |
719 #else | |
720 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | |
721 #endif | |
722 | |
723 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | |
724 #ifdef SK_CPU_ARM64 | |
725 val2 = vaddw_high_u16(val2, scdc); | |
726 #else | |
727 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | |
728 #endif | |
729 | |
730 return clamp_div255round_simd8_32( | |
731 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | |
732 } | |
733 | |
734 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
735 uint8x8x4_t ret; | |
736 | |
737 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
738 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | |
739 src.val[NEON_A], dst.val[NEON_A])
; | |
740 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], | |
741 src.val[NEON_A], dst.val[NEON_A])
; | |
742 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], | |
743 src.val[NEON_A], dst.val[NEON_A])
; | |
744 | |
745 return ret; | |
746 } | |
747 | |
748 //////////////////////////////////////////////////////////////////////////////// | |
749 | |
750 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); | |
751 | |
752 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; | |
753 | |
754 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst, | |
755 const SkPMColor* SK_RESTRICT src, int count
, | |
756 const SkAlpha* SK_RESTRICT aa) const { | |
757 SkASSERT(dst && src && count >= 0); | |
758 | |
759 SkXfermodeProc proc = this->getProc(); | |
760 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); | |
761 SkASSERT(procSIMD != NULL); | |
762 | |
763 if (NULL == aa) { | |
764 // Unrolled NEON code | |
765 // We'd like to just do this (modulo a few casts): | |
766 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst))); | |
767 // src += 8; | |
768 // dst += 8; | |
769 // but that tends to generate miserable code. Here are a bunch of faster | |
770 // workarounds for different architectures and compilers. | |
771 while (count >= 8) { | |
772 | |
773 #ifdef SK_CPU_ARM32 | |
774 uint8x8x4_t vsrc, vdst, vres; | |
775 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
776 asm volatile ( | |
777 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | |
778 "vld4.u8 %h[vdst], [%[dst]] \t\n" | |
779 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | |
780 : [dst] "r" (dst) | |
781 : | |
782 ); | |
783 #else | |
784 register uint8x8_t d0 asm("d0"); | |
785 register uint8x8_t d1 asm("d1"); | |
786 register uint8x8_t d2 asm("d2"); | |
787 register uint8x8_t d3 asm("d3"); | |
788 register uint8x8_t d4 asm("d4"); | |
789 register uint8x8_t d5 asm("d5"); | |
790 register uint8x8_t d6 asm("d6"); | |
791 register uint8x8_t d7 asm("d7"); | |
792 | |
793 asm volatile ( | |
794 "vld4.u8 {d0-d3},[%[src]]!;" | |
795 "vld4.u8 {d4-d7},[%[dst]];" | |
796 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | |
797 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | |
798 [src] "+&r" (src) | |
799 : [dst] "r" (dst) | |
800 : | |
801 ); | |
802 vsrc.val[0] = d0; vdst.val[0] = d4; | |
803 vsrc.val[1] = d1; vdst.val[1] = d5; | |
804 vsrc.val[2] = d2; vdst.val[2] = d6; | |
805 vsrc.val[3] = d3; vdst.val[3] = d7; | |
806 #endif | |
807 | |
808 vres = procSIMD(vsrc, vdst); | |
809 | |
810 vst4_u8((uint8_t*)dst, vres); | |
811 | |
812 dst += 8; | |
813 | |
814 #else // #ifdef SK_CPU_ARM32 | |
815 | |
816 asm volatile ( | |
817 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
818 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n" | |
819 "blr %[proc] \t\n" | |
820 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n" | |
821 : [src] "+&r" (src), [dst] "+&r" (dst) | |
822 : [proc] "r" (procSIMD) | |
823 : "cc", "memory", | |
824 /* We don't know what proc is going to clobber so we must | |
825 * add everything that is not callee-saved. | |
826 */ | |
827 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", | |
828 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", | |
829 "x30", /* x30 implicitly clobbered by blr */ | |
830 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | |
831 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |
832 "v27", "v28", "v29", "v30", "v31" | |
833 ); | |
834 | |
835 #endif // #ifdef SK_CPU_ARM32 | |
836 | |
837 count -= 8; | |
838 } | |
839 // Leftovers | |
840 for (int i = 0; i < count; i++) { | |
841 dst[i] = proc(src[i], dst[i]); | |
842 } | |
843 } else { | |
844 for (int i = count - 1; i >= 0; --i) { | |
845 unsigned a = aa[i]; | |
846 if (0 != a) { | |
847 SkPMColor dstC = dst[i]; | |
848 SkPMColor C = proc(src[i], dstC); | |
849 if (a != 0xFF) { | |
850 C = SkFourByteInterp_neon(C, dstC, a); | |
851 } | |
852 dst[i] = C; | |
853 } | |
854 } | |
855 } | |
856 } | |
857 | |
// Blend 'count' premultiplied 32-bit source pixels onto an RGB565 destination,
// optionally modulated by a per-pixel coverage array 'aa'.
//
// Fast path (aa == NULL): pixels go 8 at a time — the 565 dst is widened to
// 8888, the SIMD proc blends 8 pixels at once, and the result is narrowed back
// to 565; the remaining (count % 8) pixels use the scalar proc.
// Coverage path (aa != NULL): purely scalar; each covered pixel is blended and
// then interpolated toward the original dst by its coverage value.
void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src, int count,
                                     const SkAlpha* SK_RESTRICT aa) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();
    // fProcSIMD is stored type-erased; the platform factory installed it from
    // gNEONXfermodeProcs, so this cast recovers its real signature.
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    SkASSERT(procSIMD != NULL);

    if (NULL == aa) {
        while(count >= 8) {
            uint16x8_t vdst, vres16;
            uint8x8x4_t vdst32, vsrc, vres;

            vdst = vld1q_u16(dst);

#ifdef SK_CPU_ARM64
            // AArch64: plain intrinsic load; 'src' is advanced at loop bottom.
            vsrc = vld4_u8((uint8_t*)src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            // GCC > 4.6 understands the %h operand modifier, so a single
            // post-incrementing vld4 both loads the pixels and advances 'src'.
            asm volatile (
                "vld4.u8 %h[vsrc], [%[src]]! \t\n"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            // Older GCC: pin the four deinterleaved lanes to d0-d3 explicitly,
            // then copy them into the uint8x8x4_t.  The vld4 still
            // post-increments 'src'.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // Widen 565 -> 8888, blend 8 pixels, narrow back to 565.
            vdst32 = SkPixel16ToPixel32_neon8(vdst);
            vres = procSIMD(vsrc, vdst32);
            vres16 = SkPixel32ToPixel16_neon8(vres);

            vst1q_u16(dst, vres16);

            count -= 8;
            dst += 8;
#ifdef SK_CPU_ARM64
            // On ARM32 the inline asm above already advanced 'src'.
            src += 8;
#endif
        }
        // Scalar leftovers (fewer than 8 pixels remain).
        for (int i = 0; i < count; i++) {
            SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
            dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
        }
    } else {
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
                SkPMColor C = proc(src[i], dstC);
                if (0xFF != a) {
                    // Partial coverage: lerp blended color toward original dst.
                    C = SkFourByteInterp_neon(C, dstC, a);
                }
                dst[i] = SkPixel32ToPixel16_ToU16(C);
            }
        }
    }
}
932 | |
#ifndef SK_IGNORE_TO_STRING
// Debug-only description.  This subclass adds no printable state beyond the
// base class, so delegate entirely to SkProcCoeffXfermode::toString().
void SkNEONProcCoeffXfermode::toString(SkString* str) const {
    this->INHERITED::toString(str);
}
#endif
938 | |
939 //////////////////////////////////////////////////////////////////////////////// | |
940 | |
// Table of 8-pixel-at-a-time NEON xfermode procs, indexed by SkXfermode::Mode.
// A NULL entry means no NEON batch implementation exists for that mode; the
// platform factory below returns NULL for those and the caller falls back to
// the portable implementation.
SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    dstover_modeproc_neon8,
    srcin_modeproc_neon8,
    dstin_modeproc_neon8,
    srcout_modeproc_neon8,
    dstout_modeproc_neon8,
    srcatop_modeproc_neon8,
    dstatop_modeproc_neon8,
    xor_modeproc_neon8,
    plus_modeproc_neon8,
    modulate_modeproc_neon8,
    screen_modeproc_neon8,

    overlay_modeproc_neon8,
    darken_modeproc_neon8,
    lighten_modeproc_neon8,
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    hardlight_modeproc_neon8,
    NULL, // kSoftLight_Mode
    difference_modeproc_neon8,
    exclusion_modeproc_neon8,
    multiply_modeproc_neon8,

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

// Keep the table in lockstep with the SkXfermode::Mode enum.
SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
    mode_count_arm
);
979 | |
// Table of single-pixel NEON xfermode procs, indexed by SkXfermode::Mode.
// Returned by SkPlatformXfermodeProcFactory_impl_neon below; a NULL entry
// means "use the portable scalar proc for this mode".
SkXfermodeProc gNEONXfermodeProcs1[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    NULL, // kDstOver_Mode
    NULL, // kSrcIn_Mode
    NULL, // kDstIn_Mode
    NULL, // kSrcOut_Mode
    NULL, // kDstOut_Mode
    srcatop_modeproc_neon,
    dstatop_modeproc_neon,
    xor_modeproc_neon,
    plus_modeproc_neon,
    modulate_modeproc_neon,
    NULL, // kScreen_Mode

    NULL, // kOverlay_Mode
    NULL, // kDarken_Mode
    NULL, // kLighten_Mode
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    NULL, // kHardLight_Mode
    NULL, // kSoftLight_Mode
    NULL, // kDifference_Mode
    NULL, // kExclusion_Mode
    NULL, // kMultiply_Mode

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

// Keep the table in lockstep with the SkXfermode::Mode enum.
SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1,
    mode1_count_arm
);
1018 | |
1019 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, | |
1020 SkXfermode::Mode mode)
{ | |
1021 if (auto xfermode = SkCreate4pxXfermode(rec, mode)) { | |
1022 return xfermode; | |
1023 } | |
1024 // TODO: Sk4pxXfermode now covers every mode found in this file. Delete the
m all! | |
1025 if (auto proc = gNEONXfermodeProcs[mode]) { | |
1026 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, (void*)proc)); | |
1027 } | |
1028 return NULL; | |
1029 } | |
1030 | |
// Returns the single-pixel NEON proc for 'mode', or NULL when none exists
// (callers then fall back to the portable proc).  'mode' is used as a raw
// index; the compile-assert above guarantees the table covers every mode.
SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
    return gNEONXfermodeProcs1[mode];
}
OLD | NEW |