OLD | NEW |
1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
4 #include "SkUtilsArm.h" | 4 #include "SkUtilsArm.h" |
5 | 5 |
6 #if !SK_ARM_NEON_IS_NONE | |
7 | |
8 #include <arm_neon.h> | 6 #include <arm_neon.h> |
| 7 #include "SkColor_opts_neon.h" |
| 8 |
| 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
| 10 |
9 | 11 |
10 //////////////////////////////////////////////////////////////////////////////// | 12 //////////////////////////////////////////////////////////////////////////////// |
11 | 13 // NEONized skia functions |
| 14 //////////////////////////////////////////////////////////////////////////////// |
| 15 |
| 16 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha)
{ |
| 17 uint16x8_t tmp; |
| 18 uint8x8_t ret; |
| 19 |
| 20 tmp = vmull_u8(color, alpha); |
| 21 tmp = vaddq_u16(tmp, vdupq_n_u16(128)); |
| 22 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); |
| 23 |
| 24 ret = vshrn_n_u16(tmp, 8); |
| 25 |
| 26 return ret; |
| 27 } |
| 28 |
| 29 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp
ha) { |
| 30 uint16x8_t ret; |
| 31 |
| 32 ret = vmull_u8(color, alpha); |
| 33 ret = vaddq_u16(ret, vdupq_n_u16(128)); |
| 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); |
| 35 |
| 36 ret = vshrq_n_u16(ret, 8); |
| 37 |
| 38 return ret; |
| 39 } |
| 40 |
| 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { |
| 42 uint16x8_t tmp; |
| 43 |
| 44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), |
| 45 vmovn_u32(vreinterpretq_u32_s32(p2))); |
| 46 |
| 47 tmp += vdupq_n_u16(128); |
| 48 tmp += vshrq_n_u16(tmp, 8); |
| 49 |
| 50 return vshrn_n_u16(tmp, 8); |
| 51 } |
| 52 |
| 53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { |
| 54 prod += vdupq_n_u16(128); |
| 55 prod += vshrq_n_u16(prod, 8); |
| 56 |
| 57 return vshrq_n_u16(prod, 8); |
| 58 } |
| 59 |
| 60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
2) { |
| 61 uint8x8_t ret; |
| 62 uint32x4_t cmp1, cmp2; |
| 63 uint16x8_t cmp16; |
| 64 uint8x8_t cmp8, cmp8_1; |
| 65 |
| 66 // Test if <= 0 |
| 67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); |
| 68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); |
| 69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 70 cmp8_1 = vmovn_u16(cmp16); |
| 71 |
| 72 // Init to zero |
| 73 ret = vdup_n_u8(0); |
| 74 |
| 75 // Test if >= 255*255 |
| 76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); |
| 77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); |
| 78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 79 cmp8 = vmovn_u16(cmp16); |
| 80 |
| 81 // Insert 255 where true |
| 82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); |
| 83 |
| 84 // Calc SkDiv255Round |
| 85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); |
| 86 |
| 87 // Insert where false and previous test false |
| 88 cmp8 = cmp8 | cmp8_1; |
| 89 ret = vbsl_u8(cmp8, ret, div); |
| 90 |
| 91 // Return the final combination |
| 92 return ret; |
| 93 } |
| 94 |
| 95 //////////////////////////////////////////////////////////////////////////////// |
| 96 // 8 pixels modeprocs |
| 97 //////////////////////////////////////////////////////////////////////////////// |
| 98 |
| 99 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 100 uint8x8x4_t ret; |
| 101 uint16x8_t src_scale; |
| 102 |
| 103 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); |
| 104 |
| 105 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_sc
ale); |
| 106 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_sc
ale); |
| 107 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_sc
ale); |
| 108 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_sc
ale); |
| 109 |
| 110 return ret; |
| 111 } |
| 112 |
| 113 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 114 uint8x8x4_t ret; |
| 115 uint16x8_t scale; |
| 116 |
| 117 scale = SkAlpha255To256_neon8(dst.val[NEON_A]); |
| 118 |
| 119 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); |
| 120 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); |
| 121 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); |
| 122 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); |
| 123 |
| 124 return ret; |
| 125 } |
| 126 |
| 127 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 128 uint8x8x4_t ret; |
| 129 uint16x8_t scale; |
| 130 |
| 131 scale = SkAlpha255To256_neon8(src.val[NEON_A]); |
| 132 |
| 133 ret = SkAlphaMulQ_neon8(dst, scale); |
| 134 |
| 135 return ret; |
| 136 } |
| 137 |
| 138 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 139 uint8x8x4_t ret; |
| 140 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); |
| 141 |
| 142 ret = SkAlphaMulQ_neon8(src, scale); |
| 143 |
| 144 return ret; |
| 145 } |
| 146 |
| 147 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 148 uint8x8x4_t ret; |
| 149 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); |
| 150 |
| 151 ret = SkAlphaMulQ_neon8(dst, scale); |
| 152 |
| 153 return ret; |
| 154 } |
| 155 |
| 156 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 157 uint8x8x4_t ret; |
| 158 uint8x8_t isa; |
| 159 |
| 160 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); |
| 161 |
| 162 ret.val[NEON_A] = dst.val[NEON_A]; |
| 163 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) |
| 164 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); |
| 165 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) |
| 166 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); |
| 167 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) |
| 168 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); |
| 169 |
| 170 return ret; |
| 171 } |
| 172 |
| 173 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 174 uint8x8x4_t ret; |
| 175 uint8x8_t ida; |
| 176 |
| 177 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); |
| 178 |
| 179 ret.val[NEON_A] = src.val[NEON_A]; |
| 180 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) |
| 181 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); |
| 182 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) |
| 183 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); |
| 184 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) |
| 185 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); |
| 186 |
| 187 return ret; |
| 188 } |
| 189 |
| 190 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 191 uint8x8x4_t ret; |
| 192 uint8x8_t isa, ida; |
| 193 uint16x8_t tmp_wide, tmp_wide2; |
| 194 |
| 195 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); |
| 196 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); |
| 197 |
| 198 // First calc alpha |
| 199 tmp_wide = vmovl_u8(src.val[NEON_A]); |
| 200 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); |
| 201 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A
]), 1); |
| 202 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); |
| 203 ret.val[NEON_A] = vmovn_u16(tmp_wide); |
| 204 |
| 205 // Then colors |
| 206 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) |
| 207 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); |
| 208 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) |
| 209 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); |
| 210 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) |
| 211 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); |
| 212 |
| 213 return ret; |
| 214 } |
| 215 |
| 216 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 217 uint8x8x4_t ret; |
| 218 |
| 219 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); |
| 220 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); |
| 221 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); |
| 222 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); |
| 223 |
| 224 return ret; |
| 225 } |
| 226 |
| 227 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 228 uint8x8x4_t ret; |
| 229 |
| 230 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); |
| 231 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); |
| 232 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); |
| 233 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); |
| 234 |
| 235 return ret; |
| 236 } |
| 237 |
| 238 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { |
| 239 uint16x8_t tmp; |
| 240 |
| 241 tmp = vaddl_u8(a, b); |
| 242 tmp -= SkAlphaMulAlpha_neon8_16(a, b); |
| 243 |
| 244 return vmovn_u16(tmp); |
| 245 } |
| 246 |
| 247 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 248 uint8x8x4_t ret; |
| 249 |
| 250 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 251 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); |
| 252 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); |
| 253 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); |
| 254 |
| 255 return ret; |
| 256 } |
| 257 |
| 258 template <bool overlay> |
| 259 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, |
| 260 uint8x8_t sa, uint8x8_t da) { |
| 261 /* |
| 262 * In the end we're gonna use (rc + tmp) with a different rc |
| 263 * coming from an alternative. |
| 264 * The whole value (rc + tmp) can always be expressed as |
| 265 * VAL = COM - SUB in the if case |
| 266 * VAL = COM + SUB - sa*da in the else case |
| 267 * |
| 268 * with COM = 255 * (sc + dc) |
| 269 * and SUB = sc*da + dc*sa - 2*dc*sc |
| 270 */ |
| 271 |
| 272 // Prepare common subexpressions |
| 273 uint16x8_t const255 = vdupq_n_u16(255); |
| 274 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); |
| 275 uint16x8_t scda = vmull_u8(sc, da); |
| 276 uint16x8_t dcsa = vmull_u8(dc, sa); |
| 277 uint16x8_t sada = vmull_u8(sa, da); |
| 278 |
| 279 // Prepare non common subexpressions |
| 280 uint16x8_t dc2, sc2; |
| 281 uint32x4_t scdc2_1, scdc2_2; |
| 282 if (overlay) { |
| 283 dc2 = vshll_n_u8(dc, 1); |
| 284 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); |
| 285 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); |
| 286 } else { |
| 287 sc2 = vshll_n_u8(sc, 1); |
| 288 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); |
| 289 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); |
| 290 } |
| 291 |
| 292 // Calc COM |
| 293 int32x4_t com1, com2; |
| 294 com1 = vreinterpretq_s32_u32( |
| 295 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
| 296 com2 = vreinterpretq_s32_u32( |
| 297 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 298 |
| 299 // Calc SUB |
| 300 int32x4_t sub1, sub2; |
| 301 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa
))); |
| 302 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc
sa))); |
| 303 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); |
| 304 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); |
| 305 |
| 306 // Compare 2*dc <= da |
| 307 uint16x8_t cmp; |
| 308 |
| 309 if (overlay) { |
| 310 cmp = vcleq_u16(dc2, vmovl_u8(da)); |
| 311 } else { |
| 312 cmp = vcleq_u16(sc2, vmovl_u8(sa)); |
| 313 } |
| 314 |
| 315 // Prepare variables |
| 316 int32x4_t val1_1, val1_2; |
| 317 int32x4_t val2_1, val2_2; |
| 318 uint32x4_t cmp1, cmp2; |
| 319 |
| 320 cmp1 = vmovl_u16(vget_low_u16(cmp)); |
| 321 cmp1 |= vshlq_n_u32(cmp1, 16); |
| 322 cmp2 = vmovl_u16(vget_high_u16(cmp)); |
| 323 cmp2 |= vshlq_n_u32(cmp2, 16); |
| 324 |
| 325 // Calc COM - SUB |
| 326 val1_1 = com1 - sub1; |
| 327 val1_2 = com2 - sub2; |
| 328 |
| 329 // Calc COM + SUB - sa*da |
| 330 val2_1 = com1 + sub1; |
| 331 val2_2 = com2 + sub2; |
| 332 |
| 333 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada
)))); |
| 334 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad
a)))); |
| 335 |
| 336 // Insert where needed |
| 337 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); |
| 338 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); |
| 339 |
| 340 // Call the clamp_div255round function |
| 341 return clamp_div255round_simd8_32(val1_1, val1_2); |
| 342 } |
| 343 |
| 344 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, |
| 345 uint8x8_t sa, uint8x8_t da) { |
| 346 return overlay_hardlight_color<true>(sc, dc, sa, da); |
| 347 } |
| 348 |
| 349 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 350 uint8x8x4_t ret; |
| 351 |
| 352 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 353 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], |
| 354 src.val[NEON_A], dst.val[NEON_A]); |
| 355 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], |
| 356 src.val[NEON_A], dst.val[NEON_A]); |
| 357 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], |
| 358 src.val[NEON_A], dst.val[NEON_A]); |
| 359 |
| 360 return ret; |
| 361 } |
| 362 |
| 363 template <bool lighten> |
| 364 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, |
| 365 uint8x8_t sa, uint8x8_t da) { |
| 366 uint16x8_t sd, ds, cmp, tmp, tmp2; |
| 367 |
| 368 // Prepare |
| 369 sd = vmull_u8(sc, da); |
| 370 ds = vmull_u8(dc, sa); |
| 371 |
| 372 // Do test |
| 373 if (lighten) { |
| 374 cmp = vcgtq_u16(sd, ds); |
| 375 } else { |
| 376 cmp = vcltq_u16(sd, ds); |
| 377 } |
| 378 |
| 379 // Assign if |
| 380 tmp = vaddl_u8(sc, dc); |
| 381 tmp2 = tmp; |
| 382 tmp -= SkDiv255Round_neon8_16_16(ds); |
| 383 |
| 384 // Calc else |
| 385 tmp2 -= SkDiv255Round_neon8_16_16(sd); |
| 386 |
| 387 // Insert where needed |
| 388 tmp = vbslq_u16(cmp, tmp, tmp2); |
| 389 |
| 390 return vmovn_u16(tmp); |
| 391 } |
| 392 |
| 393 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, |
| 394 uint8x8_t sa, uint8x8_t da) { |
| 395 return lighten_darken_color<false>(sc, dc, sa, da); |
| 396 } |
| 397 |
| 398 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 399 uint8x8x4_t ret; |
| 400 |
| 401 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 402 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], |
| 403 src.val[NEON_A], dst.val[NEON_A]); |
| 404 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], |
| 405 src.val[NEON_A], dst.val[NEON_A]); |
| 406 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], |
| 407 src.val[NEON_A], dst.val[NEON_A]); |
| 408 |
| 409 return ret; |
| 410 } |
| 411 |
| 412 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, |
| 413 uint8x8_t sa, uint8x8_t da) { |
| 414 return lighten_darken_color<true>(sc, dc, sa, da); |
| 415 } |
| 416 |
| 417 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 418 uint8x8x4_t ret; |
| 419 |
| 420 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 421 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], |
| 422 src.val[NEON_A], dst.val[NEON_A]); |
| 423 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], |
| 424 src.val[NEON_A], dst.val[NEON_A]); |
| 425 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], |
| 426 src.val[NEON_A], dst.val[NEON_A]); |
| 427 |
| 428 return ret; |
| 429 } |
| 430 |
| 431 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, |
| 432 uint8x8_t sa, uint8x8_t da) { |
| 433 return overlay_hardlight_color<false>(sc, dc, sa, da); |
| 434 } |
| 435 |
| 436 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 437 uint8x8x4_t ret; |
| 438 |
| 439 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 440 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], |
| 441 src.val[NEON_A], dst.val[NEON_A]); |
| 442 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], |
| 443 src.val[NEON_A], dst.val[NEON_A]); |
| 444 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], |
| 445 src.val[NEON_A], dst.val[NEON_A]); |
| 446 |
| 447 return ret; |
| 448 } |
| 449 |
| 450 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, |
| 451 uint8x8_t sa, uint8x8_t da) { |
| 452 uint16x8_t sd, ds, tmp; |
| 453 int16x8_t val; |
| 454 |
| 455 sd = vmull_u8(sc, da); |
| 456 ds = vmull_u8(dc, sa); |
| 457 |
| 458 tmp = vminq_u16(sd, ds); |
| 459 tmp = SkDiv255Round_neon8_16_16(tmp); |
| 460 tmp = vshlq_n_u16(tmp, 1); |
| 461 |
| 462 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); |
| 463 |
| 464 val -= vreinterpretq_s16_u16(tmp); |
| 465 |
| 466 val = vmaxq_s16(val, vdupq_n_s16(0)); |
| 467 val = vminq_s16(val, vdupq_n_s16(255)); |
| 468 |
| 469 return vmovn_u16(vreinterpretq_u16_s16(val)); |
| 470 } |
| 471 |
| 472 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 473 uint8x8x4_t ret; |
| 474 |
| 475 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 476 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], |
| 477 src.val[NEON_A], dst.val[NEON_A]); |
| 478 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], |
| 479 src.val[NEON_A], dst.val[NEON_A]); |
| 480 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], |
| 481 src.val[NEON_A], dst.val[NEON_A]); |
| 482 |
| 483 return ret; |
| 484 } |
| 485 |
| 486 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, |
| 487 uint8x8_t sa, uint8x8_t da) { |
| 488 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ |
| 489 |
| 490 uint16x8_t sc_plus_dc, scdc, const255; |
| 491 int32x4_t term1_1, term1_2, term2_1, term2_2; |
| 492 |
| 493 /* Calc (sc + dc) and (sc * dc) */ |
| 494 sc_plus_dc = vaddl_u8(sc, dc); |
| 495 scdc = vmull_u8(sc, dc); |
| 496 |
| 497 /* Prepare constants */ |
| 498 const255 = vdupq_n_u16(255); |
| 499 |
| 500 /* Calc the first term */ |
| 501 term1_1 = vreinterpretq_s32_u32( |
| 502 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
| 503 term1_2 = vreinterpretq_s32_u32( |
| 504 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 505 |
| 506 /* Calc the second term */ |
| 507 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); |
| 508 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); |
| 509 |
| 510 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); |
| 511 } |
| 512 |
| 513 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 514 uint8x8x4_t ret; |
| 515 |
| 516 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 517 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], |
| 518 src.val[NEON_A], dst.val[NEON_A]); |
| 519 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], |
| 520 src.val[NEON_A], dst.val[NEON_A]); |
| 521 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], |
| 522 src.val[NEON_A], dst.val[NEON_A]); |
| 523 |
| 524 return ret; |
| 525 } |
| 526 |
| 527 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, |
| 528 uint8x8_t sa, uint8x8_t da) { |
| 529 uint32x4_t val1, val2; |
| 530 uint16x8_t scdc, t1, t2; |
| 531 |
| 532 t1 = vmull_u8(sc, vdup_n_u8(255) - da); |
| 533 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); |
| 534 scdc = vmull_u8(sc, dc); |
| 535 |
| 536 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); |
| 537 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); |
| 538 |
| 539 val1 = vaddw_u16(val1, vget_low_u16(scdc)); |
| 540 val2 = vaddw_u16(val2, vget_high_u16(scdc)); |
| 541 |
| 542 return clamp_div255round_simd8_32( |
| 543 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); |
| 544 } |
| 545 |
| 546 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 547 uint8x8x4_t ret; |
| 548 |
| 549 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 550 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], |
| 551 src.val[NEON_A], dst.val[NEON_A])
; |
| 552 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], |
| 553 src.val[NEON_A], dst.val[NEON_A])
; |
| 554 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], |
| 555 src.val[NEON_A], dst.val[NEON_A])
; |
| 556 |
| 557 return ret; |
| 558 } |
| 559 |
| 560 //////////////////////////////////////////////////////////////////////////////// |
| 561 |
12 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); | 562 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); |
13 | 563 |
14 class SkNEONProcCoeffXfermode : public SkProcCoeffXfermode { | 564 class SkNEONProcCoeffXfermode : public SkProcCoeffXfermode { |
15 public: | 565 public: |
16 SkNEONProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, | 566 SkNEONProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, |
17 SkXfermodeProcSIMD procSIMD) | 567 SkXfermodeProcSIMD procSIMD) |
18 : INHERITED(rec, mode), fProcSIMD(procSIMD) {} | 568 : INHERITED(rec, mode), fProcSIMD(procSIMD) {} |
19 | 569 |
20 virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count, | 570 virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count, |
21 const SkAlpha aa[]) const SK_OVERRIDE; | 571 const SkAlpha aa[]) const SK_OVERRIDE; |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
99 } | 649 } |
100 } | 650 } |
101 | 651 |
102 //////////////////////////////////////////////////////////////////////////////// | 652 //////////////////////////////////////////////////////////////////////////////// |
103 | 653 |
104 SkXfermodeProcSIMD gNEONXfermodeProcs[] = { | 654 SkXfermodeProcSIMD gNEONXfermodeProcs[] = { |
105 [SkXfermode::kClear_Mode] = NULL, | 655 [SkXfermode::kClear_Mode] = NULL, |
106 [SkXfermode::kSrc_Mode] = NULL, | 656 [SkXfermode::kSrc_Mode] = NULL, |
107 [SkXfermode::kDst_Mode] = NULL, | 657 [SkXfermode::kDst_Mode] = NULL, |
108 [SkXfermode::kSrcOver_Mode] = NULL, | 658 [SkXfermode::kSrcOver_Mode] = NULL, |
109 [SkXfermode::kDstOver_Mode] = NULL, | 659 [SkXfermode::kDstOver_Mode] = dstover_modeproc_neon8, |
110 [SkXfermode::kSrcIn_Mode] = NULL, | 660 [SkXfermode::kSrcIn_Mode] = srcin_modeproc_neon8, |
111 [SkXfermode::kDstIn_Mode] = NULL, | 661 [SkXfermode::kDstIn_Mode] = dstin_modeproc_neon8, |
112 [SkXfermode::kSrcOut_Mode] = NULL, | 662 [SkXfermode::kSrcOut_Mode] = srcout_modeproc_neon8, |
113 [SkXfermode::kDstOut_Mode] = NULL, | 663 [SkXfermode::kDstOut_Mode] = dstout_modeproc_neon8, |
114 [SkXfermode::kSrcATop_Mode] = NULL, | 664 [SkXfermode::kSrcATop_Mode] = srcatop_modeproc_neon8, |
115 [SkXfermode::kDstATop_Mode] = NULL, | 665 [SkXfermode::kDstATop_Mode] = dstatop_modeproc_neon8, |
116 [SkXfermode::kXor_Mode] = NULL, | 666 [SkXfermode::kXor_Mode] = xor_modeproc_neon8, |
117 [SkXfermode::kPlus_Mode] = NULL, | 667 [SkXfermode::kPlus_Mode] = plus_modeproc_neon8, |
118 [SkXfermode::kModulate_Mode]= NULL, | 668 [SkXfermode::kModulate_Mode]= modulate_modeproc_neon8, |
119 [SkXfermode::kScreen_Mode] = NULL, | 669 [SkXfermode::kScreen_Mode] = screen_modeproc_neon8, |
120 | 670 |
121 [SkXfermode::kOverlay_Mode] = NULL, | 671 [SkXfermode::kOverlay_Mode] = overlay_modeproc_neon8, |
122 [SkXfermode::kDarken_Mode] = NULL, | 672 [SkXfermode::kDarken_Mode] = darken_modeproc_neon8, |
123 [SkXfermode::kLighten_Mode] = NULL, | 673 [SkXfermode::kLighten_Mode] = lighten_modeproc_neon8, |
124 [SkXfermode::kColorDodge_Mode] = NULL, | 674 [SkXfermode::kColorDodge_Mode] = NULL, |
125 [SkXfermode::kColorBurn_Mode] = NULL, | 675 [SkXfermode::kColorBurn_Mode] = NULL, |
126 [SkXfermode::kHardLight_Mode] = NULL, | 676 [SkXfermode::kHardLight_Mode] = hardlight_modeproc_neon8, |
127 [SkXfermode::kSoftLight_Mode] = NULL, | 677 [SkXfermode::kSoftLight_Mode] = NULL, |
128 [SkXfermode::kDifference_Mode] = NULL, | 678 [SkXfermode::kDifference_Mode] = difference_modeproc_neon8, |
129 [SkXfermode::kExclusion_Mode] = NULL, | 679 [SkXfermode::kExclusion_Mode] = exclusion_modeproc_neon8, |
130 [SkXfermode::kMultiply_Mode] = NULL, | 680 [SkXfermode::kMultiply_Mode] = multiply_modeproc_neon8, |
131 | 681 |
132 [SkXfermode::kHue_Mode] = NULL, | 682 [SkXfermode::kHue_Mode] = NULL, |
133 [SkXfermode::kSaturation_Mode] = NULL, | 683 [SkXfermode::kSaturation_Mode] = NULL, |
134 [SkXfermode::kColor_Mode] = NULL, | 684 [SkXfermode::kColor_Mode] = NULL, |
135 [SkXfermode::kLuminosity_Mode] = NULL, | 685 [SkXfermode::kLuminosity_Mode] = NULL, |
136 }; | 686 }; |
137 | 687 |
138 SK_COMPILE_ASSERT( | 688 SK_COMPILE_ASSERT( |
139 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, | 689 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, |
140 mode_count_arm | 690 mode_count_arm |
141 ); | 691 ); |
142 | 692 |
143 #endif | |
144 | |
145 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, | 693 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, |
146 SkXfermode::Mode mode) { | 694 SkXfermode::Mode mode) { |
147 #if !SK_ARM_NEON_IS_NONE | 695 #if SK_ARM_NEON_IS_DYNAMIC |
148 #if SK_ARM_NEON_IS_DYNAMIC | |
149 if ((sk_cpu_arm_has_neon()) && (gNEONXfermodeProcs[mode] != NULL)) { | 696 if ((sk_cpu_arm_has_neon()) && (gNEONXfermodeProcs[mode] != NULL)) { |
150 #elif SK_ARM_NEON_IS_ALWAYS | 697 #elif SK_ARM_NEON_IS_ALWAYS |
151 if (gNEONXfermodeProcs[mode] != NULL) { | 698 if (gNEONXfermodeProcs[mode] != NULL) { |
152 #endif | 699 #endif |
153 return SkNEW_ARGS(SkNEONProcCoeffXfermode, | 700 return SkNEW_ARGS(SkNEONProcCoeffXfermode, |
154 (rec, mode, gNEONXfermodeProcs[mode])); | 701 (rec, mode, gNEONXfermodeProcs[mode])); |
155 } | 702 } |
156 #endif | |
157 return NULL; | 703 return NULL; |
158 } | 704 } |
OLD | NEW |