#include "SkXfermode.h"
#include "SkXfermode_proccoeff.h"
#include "SkColorPriv.h"

#include <arm_neon.h>
#include "SkColor_opts_neon.h"
#include "SkXfermode_opts_arm_neon.h"

#define SkAlphaMulAlpha(a, b)   SkMulDiv255Round(a, b)

////////////////////////////////////////////////////////////////////////////////
// NEONized skia functions
////////////////////////////////////////////////////////////////////////////////
static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) {
    uint16x8_t tmp;
    uint8x8_t ret;

    tmp = vmull_u8(color, alpha);
    tmp = vaddq_u16(tmp, vdupq_n_u16(128));
    tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8));

    ret = vshrn_n_u16(tmp, 8);

    return ret;
}
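
// The widen / add-128 / add-high-byte / narrow sequence above is the NEON
// expansion of the scalar rounding divide-by-255 used by SkMulDiv255Round:
//     prod = color * alpha + 128;
//     result = (prod + (prod >> 8)) >> 8;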

static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) {
    uint16x8_t ret;

    ret = vmull_u8(color, alpha);
    ret = vaddq_u16(ret, vdupq_n_u16(128));
    ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));

    ret = vshrq_n_u16(ret, 8);

    return ret;
}

static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
    uint16x8_t tmp;

    tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
                       vmovn_u32(vreinterpretq_u32_s32(p2)));

    tmp += vdupq_n_u16(128);
    tmp += vshrq_n_u16(tmp, 8);

    return vshrn_n_u16(tmp, 8);
}

static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);

    return vshrq_n_u16(prod, 8);
}

static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) {
    uint8x8_t ret;
    uint32x4_t cmp1, cmp2;
    uint16x8_t cmp16;
    uint8x8_t cmp8, cmp8_1;

    // Test if <= 0
    cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
    cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
    cmp8_1 = vmovn_u16(cmp16);

    // Init to zero
    ret = vdup_n_u8(0);

    // Test if >= 255*255
    cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
    cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
    cmp8 = vmovn_u16(cmp16);

    // Insert 255 where true
    ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);

    // Calc SkDiv255Round
    uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);

    // Keep the clamped 0/255 lanes where either test fired; take the
    // computed quotient everywhere else
    cmp8 = cmp8 | cmp8_1;
    ret = vbsl_u8(cmp8, ret, div);

    return ret;
}
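
// Scalar equivalent of the lane-select logic above, for reference:
//     if (val <= 0)         return 0;
//     if (val >= 255 * 255) return 255;
//     return SkDiv255Round(val);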

////////////////////////////////////////////////////////////////////////////////
// 8-pixel modeprocs
////////////////////////////////////////////////////////////////////////////////

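// Each modeproc below handles eight pixels at a time, with the channels
// already deinterleaved into one uint8x8_t per plane (see the vld4.u8 loads
// in xfer32 below). They mirror the scalar SkXfermode procs of the same name.
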
uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t src_scale;

    src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);

    ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale);
    ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale);
    ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale);
    ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale);

    return ret;
}

uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale;

    scale = SkAlpha255To256_neon8(dst.val[NEON_A]);

    ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale);
    ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale);
    ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale);
    ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale);

    return ret;
}

uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale;

    scale = SkAlpha255To256_neon8(src.val[NEON_A]);

    ret = SkAlphaMulQ_neon8(dst, scale);

    return ret;
}

uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);

    ret = SkAlphaMulQ_neon8(src, scale);

    return ret;
}

uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]);

    ret = SkAlphaMulQ_neon8(dst, scale);

    return ret;
}

uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint8x8_t isa;

    isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);

    ret.val[NEON_A] = dst.val[NEON_A];
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A])
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A])
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A])
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);

    return ret;
}

uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint8x8_t ida;

    ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);

    ret.val[NEON_A] = src.val[NEON_A];
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]);

    return ret;
}

uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint8x8_t isa, ida;
    uint16x8_t tmp_wide, tmp_wide2;

    isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
    ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);

    // First calc alpha
    tmp_wide = vmovl_u8(src.val[NEON_A]);
    tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]);
    tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1);
    tmp_wide = vsubq_u16(tmp_wide, tmp_wide2);
    ret.val[NEON_A] = vmovn_u16(tmp_wide);

    // Then colors
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);

    return ret;
}
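
// Scalar reference for the alpha computation above:
//     ra = sa + da - 2 * SkAlphaMulAlpha(sa, da);
// The vshll_n_u8(..., 1) widens and doubles the rounded product in one step.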

uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]);
    ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]);
    ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]);

    return ret;
}

uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]);

    return ret;
}

static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) {
    uint16x8_t tmp;

    tmp = vaddl_u8(a, b);
    tmp -= SkAlphaMulAlpha_neon8_16(a, b);

    return vmovn_u16(tmp);
}
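
// srcover_color computes a + b - a*b/255 (rounded). This is both the screen
// blend applied per channel and the resulting alpha of the advanced modes
// below, which is why they all reuse it for their NEON_A lane.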

uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]);
    ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]);
    ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]);

    return ret;
}

template <bool overlay>
static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
                                                uint8x8_t sa, uint8x8_t da) {
    /*
     * In the end we compute (rc + tmp), where rc differs between the two
     * branches. The whole value (rc + tmp) can always be expressed as
     *     VAL = COM - SUB           in the if case
     *     VAL = COM + SUB - sa*da   in the else case
     *
     * with COM = 255 * (sc + dc)
     * and  SUB = sc*da + dc*sa - 2*dc*sc
     */

    // Prepare common subexpressions
    uint16x8_t const255 = vdupq_n_u16(255);
    uint16x8_t sc_plus_dc = vaddl_u8(sc, dc);
    uint16x8_t scda = vmull_u8(sc, da);
    uint16x8_t dcsa = vmull_u8(dc, sa);
    uint16x8_t sada = vmull_u8(sa, da);

    // Prepare non common subexpressions
    uint16x8_t dc2, sc2;
    uint32x4_t scdc2_1, scdc2_2;
    if (overlay) {
        dc2 = vshll_n_u8(dc, 1);
        scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
        scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
    } else {
        sc2 = vshll_n_u8(sc, 1);
        scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
        scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
    }

    // Calc COM
    int32x4_t com1, com2;
    com1 = vreinterpretq_s32_u32(
            vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    com2 = vreinterpretq_s32_u32(
            vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

    // Calc SUB
    int32x4_t sub1, sub2;
    sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
    sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
    sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
    sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));

    // Compare 2*dc <= da (overlay) or 2*sc <= sa (hardlight)
    uint16x8_t cmp;

    if (overlay) {
        cmp = vcleq_u16(dc2, vmovl_u8(da));
    } else {
        cmp = vcleq_u16(sc2, vmovl_u8(sa));
    }

    // Prepare variables
    int32x4_t val1_1, val1_2;
    int32x4_t val2_1, val2_2;
    uint32x4_t cmp1, cmp2;

    // Widen the 16-bit comparison masks to full 32-bit lane masks
    cmp1 = vmovl_u16(vget_low_u16(cmp));
    cmp1 |= vshlq_n_u32(cmp1, 16);
    cmp2 = vmovl_u16(vget_high_u16(cmp));
    cmp2 |= vshlq_n_u32(cmp2, 16);

    // Calc COM - SUB
    val1_1 = com1 - sub1;
    val1_2 = com2 - sub2;

    // Calc COM + SUB - sa*da
    val2_1 = com1 + sub1;
    val2_2 = com2 + sub2;

    val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
    val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));

    // Insert where needed
    val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
    val1_2 = vbslq_s32(cmp2, val1_2, val2_2);

    // Clamp to [0, 255] and divide by 255 with rounding
    return clamp_div255round_simd8_32(val1_1, val1_2);
}
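
// Scalar reference, consistent with the COM/SUB identity above (overlay case;
// hardlight swaps the roles of src and dst):
//     tmp = sc*(255 - da) + dc*(255 - sa);
//     if (2*dc <= da) rc = 2*sc*dc;
//     else            rc = sa*da - 2*(da - dc)*(sa - sc);
//     return clamp_div255round(rc + tmp);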

static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,
                                      uint8x8_t sa, uint8x8_t da) {
    return overlay_hardlight_color<true>(sc, dc, sa, da);
}

uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B],
                                    src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

template <bool lighten>
static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc,
                                             uint8x8_t sa, uint8x8_t da) {
    uint16x8_t sd, ds, cmp, tmp, tmp2;

    // Prepare
    sd = vmull_u8(sc, da);
    ds = vmull_u8(dc, sa);

    // Do test
    if (lighten) {
        cmp = vcgtq_u16(sd, ds);
    } else {
        cmp = vcltq_u16(sd, ds);
    }

    // Value for the "if" lanes
    tmp = vaddl_u8(sc, dc);
    tmp2 = tmp;
    tmp -= SkDiv255Round_neon8_16_16(ds);

    // Value for the "else" lanes
    tmp2 -= SkDiv255Round_neon8_16_16(sd);

    // Insert where needed
    tmp = vbslq_u16(cmp, tmp, tmp2);

    return vmovn_u16(tmp);
}
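
// Scalar reference: both modes compute sc + dc - SkDiv255Round(x), where
// lighten subtracts the smaller of sc*da and dc*sa and darken subtracts the
// larger, selected by the sd/ds comparison above.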

static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc,
                                     uint8x8_t sa, uint8x8_t da) {
    return lighten_darken_color<false>(sc, dc, sa, da);
}

uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R],
                                   src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G],
                                   src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B],
                                   src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc,
                                      uint8x8_t sa, uint8x8_t da) {
    return lighten_darken_color<true>(sc, dc, sa, da);
}

uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B],
                                    src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc,
                                        uint8x8_t sa, uint8x8_t da) {
    return overlay_hardlight_color<false>(sc, dc, sa, da);
}

uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B],
                                      src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc,
                                         uint8x8_t sa, uint8x8_t da) {
    uint16x8_t sd, ds, tmp;
    int16x8_t val;

    sd = vmull_u8(sc, da);
    ds = vmull_u8(dc, sa);

    tmp = vminq_u16(sd, ds);
    tmp = SkDiv255Round_neon8_16_16(tmp);
    tmp = vshlq_n_u16(tmp, 1);

    val = vreinterpretq_s16_u16(vaddl_u8(sc, dc));

    val -= vreinterpretq_s16_u16(tmp);

    val = vmaxq_s16(val, vdupq_n_s16(0));
    val = vminq_s16(val, vdupq_n_s16(255));

    return vmovn_u16(vreinterpretq_u16_s16(val));
}
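
// Scalar reference:
//     rc = sc + dc - 2 * SkDiv255Round(min(sc*da, dc*sa)), clamped to [0, 255]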

uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R],
                                       src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G],
                                       src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B],
                                       src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
                                        uint8x8_t sa, uint8x8_t da) {
    /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */

    uint16x8_t sc_plus_dc, scdc, const255;
    int32x4_t term1_1, term1_2, term2_1, term2_2;

    /* Calc (sc + dc) and (sc * dc) */
    sc_plus_dc = vaddl_u8(sc, dc);
    scdc = vmull_u8(sc, dc);

    /* Prepare constants */
    const255 = vdupq_n_u16(255);

    /* Calc the first term */
    term1_1 = vreinterpretq_s32_u32(
            vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    term1_2 = vreinterpretq_s32_u32(
            vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

    /* Calc the second term */
    term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
    term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));

    return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
}

uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],
                                      src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
                                                 uint8x8_t sa, uint8x8_t da) {
    uint32x4_t val1, val2;
    uint16x8_t scdc, t1, t2;

    t1 = vmull_u8(sc, vdup_n_u8(255) - da);
    t2 = vmull_u8(dc, vdup_n_u8(255) - sa);
    scdc = vmull_u8(sc, dc);

    val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
    val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));

    val1 = vaddw_u16(val1, vget_low_u16(scdc));
    val2 = vaddw_u16(val2, vget_high_u16(scdc));

    return clamp_div255round_simd8_32(
            vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
}
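
// Scalar reference:
//     rc = clamp_div255round(sc*(255 - da) + dc*(255 - sa) + sc*dc)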

uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],
                                               src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G],
                                               src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B],
                                               src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

////////////////////////////////////////////////////////////////////////////////

typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);

extern SkXfermodeProcSIMD gNEONXfermodeProcs[];

SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer)
        : INHERITED(buffer) {
    fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
}

void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
                                     int count, const SkAlpha aa[]) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);

    if (NULL == aa) {
        // Unrolled NEON code
        while (count >= 8) {
            uint8x8x4_t vsrc, vdst, vres;

#if (__GNUC__ == 4) && (__GNUC_MINOR__ > 6)
djsollen, 2013/10/17 18:34:18:
What happens to this code when we go to version 5?
            asm volatile (
                "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
                "vld4.u8    %h[vdst], [%[dst]]   \t\n"
                : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
                : [dst] "r" (dst)
                :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");
            register uint8x8_t d4 asm("d4");
            register uint8x8_t d5 asm("d5");
            register uint8x8_t d6 asm("d6");
            register uint8x8_t d7 asm("d7");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                "vld4.u8    {d4-d7},[%[dst]];"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
                  [src] "+&r" (src)
                : [dst] "r" (dst)
                :
            );
            vsrc.val[0] = d0; vdst.val[0] = d4;
            vsrc.val[1] = d1; vdst.val[1] = d5;
            vsrc.val[2] = d2; vdst.val[2] = d6;
            vsrc.val[3] = d3; vdst.val[3] = d7;
#endif
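            // Both paths perform the same vld4.u8 loads: 8 pixels from src
            // (post-incrementing the pointer) and 8 from dst, deinterleaved
            // into one d register per channel. Presumably the explicit d0-d7
            // pinning exists because GCC before 4.7 cannot map a uint8x8x4_t
            // operand through the %h modifier used in the first path.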

            vres = procSIMD(vsrc, vdst);

            vst4_u8((uint8_t*)dst, vres);

            count -= 8;
            dst += 8;
        }
        // Leftovers
        for (int i = 0; i < count; i++) {
            dst[i] = proc(src[i], dst[i]);
        }
    } else {
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = dst[i];
                SkPMColor C = proc(src[i], dstC);
                if (a != 0xFF) {
                    C = SkFourByteInterp(C, dstC, a);
                }
                dst[i] = C;
            }
        }
    }
}

#ifdef SK_DEVELOPER
void SkNEONProcCoeffXfermode::toString(SkString* str) const {
    this->INHERITED::toString(str);
}
#endif

////////////////////////////////////////////////////////////////////////////////

SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    dstover_modeproc_neon8,
    srcin_modeproc_neon8,
    dstin_modeproc_neon8,
    srcout_modeproc_neon8,
    dstout_modeproc_neon8,
    srcatop_modeproc_neon8,
    dstatop_modeproc_neon8,
    xor_modeproc_neon8,
    plus_modeproc_neon8,
    modulate_modeproc_neon8,
    screen_modeproc_neon8,

    overlay_modeproc_neon8,
    darken_modeproc_neon8,
    lighten_modeproc_neon8,
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    hardlight_modeproc_neon8,
    NULL, // kSoftLight_Mode
    difference_modeproc_neon8,
    exclusion_modeproc_neon8,
    multiply_modeproc_neon8,

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
    mode_count_arm
);

SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
                                                         SkXfermode::Mode mode) {
    void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]);

    if (procSIMD != NULL) {
        return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
    }
    return NULL;
}
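
// Typical use (a sketch; the ProcCoeff setup and the caller are assumptions):
//     SkProcCoeffXfermode* xfer =
//             SkPlatformXfermodeFactory_impl_neon(rec, SkXfermode::kMultiply_Mode);
// The factory returns NULL for modes with no NEON proc (e.g. kClear_Mode),
// letting the caller fall back to the portable implementation.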