Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #include "SkXfermode.h" | |
| 2 #include "SkXfermode_proccoeff.h" | |
| 3 #include "SkColorPriv.h" | |
| 4 #include "SkUtilsArm.h" | |
| 5 | |
| 6 #include <arm_neon.h> | |
| 7 #include "SkColor_opts_neon.h" | |
| 8 | |
| 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | |
| 10 | |
| 11 | |
| 12 //////////////////////////////////////////////////////////////////////////////// | |
| 13 // NEONized skia functions | |
| 14 //////////////////////////////////////////////////////////////////////////////// | |
| 15 | |
| 16 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) { | |
| 17 uint16x8_t tmp; | |
| 18 uint8x8_t ret; | |
| 19 | |
| 20 tmp = vmull_u8(color, alpha); | |
| 21 tmp = vaddq_u16(tmp, vdupq_n_u16(128)); | |
| 22 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); | |
| 23 | |
| 24 ret = vshrn_n_u16(tmp, 8); | |
| 25 | |
| 26 return ret; | |
| 27 } | |
| 28 | |
| 29 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp ha) { | |
| 30 uint16x8_t ret; | |
| 31 | |
| 32 ret = vmull_u8(color, alpha); | |
| 33 ret = vaddq_u16(ret, vdupq_n_u16(128)); | |
| 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | |
| 35 | |
| 36 ret = vshrq_n_u16(ret, 8); | |
| 37 | |
| 38 return ret; | |
| 39 } | |
| 40 | |
| 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | |
| 42 uint16x8_t tmp; | |
| 43 | |
| 44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
| 45 vmovn_u32(vreinterpretq_u32_s32(p2))); | |
| 46 | |
| 47 tmp += vdupq_n_u16(128); | |
| 48 tmp += vshrq_n_u16(tmp, 8); | |
| 49 | |
| 50 return vshrn_n_u16(tmp, 8); | |
| 51 } | |
| 52 | |
| 53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | |
| 54 prod += vdupq_n_u16(128); | |
| 55 prod += vshrq_n_u16(prod, 8); | |
| 56 | |
| 57 return vshrq_n_u16(prod, 8); | |
| 58 } | |
| 59 | |
| 60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val 2) { | |
| 61 uint8x8_t ret; | |
| 62 uint32x4_t cmp1, cmp2; | |
| 63 uint16x8_t cmp16; | |
| 64 uint8x8_t cmp8, cmp8_1; | |
| 65 | |
| 66 // Test if <= 0 | |
| 67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | |
| 68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | |
| 69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | |
| 70 cmp8_1 = vmovn_u16(cmp16); | |
| 71 | |
| 72 // Init to zero | |
| 73 ret = vdup_n_u8(0); | |
| 74 | |
| 75 // Test if >= 255*255 | |
| 76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | |
| 77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | |
| 78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | |
| 79 cmp8 = vmovn_u16(cmp16); | |
| 80 | |
| 81 // Insert 255 where true | |
| 82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | |
| 83 | |
| 84 // Calc SkDiv255Round | |
| 85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | |
| 86 | |
| 87 // Insert where false and previous test false | |
| 88 cmp8 = cmp8 | cmp8_1; | |
| 89 ret = vbsl_u8(cmp8, ret, div); | |
| 90 | |
| 91 // Return the final combination | |
| 92 return ret; | |
| 93 } | |
| 94 | |
| 95 //////////////////////////////////////////////////////////////////////////////// | |
| 96 // 8 pixels modeprocs | |
| 97 //////////////////////////////////////////////////////////////////////////////// | |
| 98 | |
| 99 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 100 uint8x8x4_t ret; | |
| 101 uint16x8_t src_scale; | |
| 102 | |
| 103 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); | |
| 104 | |
| 105 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_sc ale); | |
| 106 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_sc ale); | |
| 107 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_sc ale); | |
| 108 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_sc ale); | |
| 109 | |
| 110 return ret; | |
| 111 } | |
| 112 | |
| 113 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 114 uint8x8x4_t ret; | |
| 115 uint16x8_t scale; | |
| 116 | |
| 117 scale = SkAlpha255To256_neon8(dst.val[NEON_A]); | |
| 118 | |
| 119 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); | |
| 120 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); | |
| 121 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); | |
| 122 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); | |
| 123 | |
| 124 return ret; | |
| 125 } | |
| 126 | |
| 127 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 128 uint8x8x4_t ret; | |
| 129 uint16x8_t scale; | |
| 130 | |
| 131 scale = SkAlpha255To256_neon8(src.val[NEON_A]); | |
| 132 | |
| 133 ret = SkAlphaMulQ_neon8(dst, scale); | |
| 134 | |
| 135 return ret; | |
| 136 } | |
| 137 | |
| 138 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 139 uint8x8x4_t ret; | |
| 140 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); | |
| 141 | |
| 142 ret = SkAlphaMulQ_neon8(src, scale); | |
| 143 | |
| 144 return ret; | |
| 145 } | |
| 146 | |
| 147 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 148 uint8x8x4_t ret; | |
| 149 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); | |
| 150 | |
| 151 ret = SkAlphaMulQ_neon8(dst, scale); | |
| 152 | |
| 153 return ret; | |
| 154 } | |
| 155 | |
| 156 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 157 uint8x8x4_t ret; | |
| 158 uint8x8_t isa; | |
| 159 | |
| 160 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); | |
| 161 | |
| 162 ret.val[NEON_A] = dst.val[NEON_A]; | |
| 163 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) | |
| 164 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); | |
| 165 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) | |
| 166 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); | |
| 167 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) | |
| 168 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); | |
| 169 | |
| 170 return ret; | |
| 171 } | |
| 172 | |
| 173 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 174 uint8x8x4_t ret; | |
| 175 uint8x8_t ida; | |
| 176 | |
| 177 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); | |
| 178 | |
| 179 ret.val[NEON_A] = src.val[NEON_A]; | |
| 180 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) | |
| 181 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); | |
| 182 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) | |
| 183 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); | |
| 184 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) | |
| 185 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); | |
| 186 | |
| 187 return ret; | |
| 188 } | |
| 189 | |
| 190 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 191 uint8x8x4_t ret; | |
| 192 uint8x8_t isa, ida; | |
| 193 uint16x8_t tmp_wide, tmp_wide2; | |
| 194 | |
| 195 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); | |
| 196 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); | |
| 197 | |
| 198 // First calc alpha | |
| 199 tmp_wide = vmovl_u8(src.val[NEON_A]); | |
| 200 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); | |
| 201 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A ]), 1); | |
| 202 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); | |
| 203 ret.val[NEON_A] = vmovn_u16(tmp_wide); | |
| 204 | |
| 205 // Then colors | |
| 206 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) | |
| 207 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); | |
| 208 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) | |
| 209 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); | |
| 210 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) | |
| 211 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); | |
| 212 | |
| 213 return ret; | |
| 214 } | |
| 215 | |
| 216 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 217 uint8x8x4_t ret; | |
| 218 | |
| 219 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); | |
| 220 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); | |
| 221 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); | |
| 222 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); | |
| 223 | |
| 224 return ret; | |
| 225 } | |
| 226 | |
| 227 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 228 uint8x8x4_t ret; | |
| 229 | |
| 230 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); | |
| 231 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); | |
| 232 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); | |
| 233 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); | |
| 234 | |
| 235 return ret; | |
| 236 } | |
| 237 | |
| 238 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { | |
| 239 uint16x8_t tmp; | |
| 240 | |
| 241 tmp = vaddl_u8(a, b); | |
| 242 tmp -= SkAlphaMulAlpha_neon8_16(a, b); | |
| 243 | |
| 244 return vmovn_u16(tmp); | |
| 245 } | |
| 246 | |
| 247 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 248 uint8x8x4_t ret; | |
| 249 | |
| 250 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 251 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); | |
| 252 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); | |
| 253 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); | |
| 254 | |
| 255 return ret; | |
| 256 } | |
| 257 | |
| 258 template <bool overlay> | |
| 259 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, | |
| 260 uint8x8_t sa, uint8x8_t da) { | |
| 261 /* | |
| 262 * In the end we're gonna use (rc + tmp) with a different rc | |
| 263 * coming from an alternative. | |
| 264 * The whole value (rc + tmp) can always be expressed as | |
| 265 * VAL = COM - SUB in the if case | |
| 266 * VAL = COM + SUB - sa*da in the else case | |
| 267 * | |
| 268 * with COM = 255 * (sc + dc) | |
| 269 * and SUB = sc*da + dc*sa - 2*dc*sc | |
| 270 */ | |
| 271 | |
| 272 // Prepare common subexpressions | |
| 273 uint16x8_t const255 = vdupq_n_u16(255); | |
| 274 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); | |
| 275 uint16x8_t scda = vmull_u8(sc, da); | |
| 276 uint16x8_t dcsa = vmull_u8(dc, sa); | |
| 277 uint16x8_t sada = vmull_u8(sa, da); | |
| 278 | |
| 279 // Prepare non common subexpressions | |
| 280 uint16x8_t dc2, sc2; | |
| 281 uint32x4_t scdc2_1, scdc2_2; | |
| 282 if (overlay) { | |
| 283 dc2 = vshll_n_u8(dc, 1); | |
| 284 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | |
| 285 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | |
| 286 } else { | |
| 287 sc2 = vshll_n_u8(sc, 1); | |
| 288 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | |
| 289 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | |
| 290 } | |
| 291 | |
| 292 // Calc COM | |
| 293 int32x4_t com1, com2; | |
| 294 com1 = vreinterpretq_s32_u32( | |
| 295 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | |
| 296 com2 = vreinterpretq_s32_u32( | |
| 297 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | |
| 298 | |
| 299 // Calc SUB | |
| 300 int32x4_t sub1, sub2; | |
| 301 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa ))); | |
| 302 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc sa))); | |
| 303 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | |
| 304 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | |
| 305 | |
| 306 // Compare 2*dc <= da | |
| 307 uint16x8_t cmp; | |
| 308 | |
| 309 if (overlay) { | |
| 310 cmp = vcleq_u16(dc2, vmovl_u8(da)); | |
| 311 } else { | |
| 312 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | |
| 313 } | |
| 314 | |
| 315 // Prepare variables | |
| 316 int32x4_t val1_1, val1_2; | |
| 317 int32x4_t val2_1, val2_2; | |
| 318 uint32x4_t cmp1, cmp2; | |
| 319 | |
| 320 cmp1 = vmovl_u16(vget_low_u16(cmp)); | |
| 321 cmp1 |= vshlq_n_u32(cmp1, 16); | |
| 322 cmp2 = vmovl_u16(vget_high_u16(cmp)); | |
| 323 cmp2 |= vshlq_n_u32(cmp2, 16); | |
| 324 | |
| 325 // Calc COM - SUB | |
| 326 val1_1 = com1 - sub1; | |
| 327 val1_2 = com2 - sub2; | |
| 328 | |
| 329 // Calc COM + SUB - sa*da | |
| 330 val2_1 = com1 + sub1; | |
| 331 val2_2 = com2 + sub2; | |
| 332 | |
| 333 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada )))); | |
| 334 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad a)))); | |
| 335 | |
| 336 // Insert where needed | |
| 337 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | |
| 338 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | |
| 339 | |
| 340 // Call the clamp_div255round function | |
| 341 return clamp_div255round_simd8_32(val1_1, val1_2); | |
| 342 } | |
| 343 | |
| 344 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | |
| 345 uint8x8_t sa, uint8x8_t da) { | |
| 346 return overlay_hardlight_color<true>(sc, dc, sa, da); | |
| 347 } | |
| 348 | |
| 349 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 350 uint8x8x4_t ret; | |
| 351 | |
| 352 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 353 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], | |
| 354 src.val[NEON_A], dst.val[NEON_A]); | |
| 355 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], | |
| 356 src.val[NEON_A], dst.val[NEON_A]); | |
| 357 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], | |
| 358 src.val[NEON_A], dst.val[NEON_A]); | |
| 359 | |
| 360 return ret; | |
| 361 } | |
| 362 | |
| 363 template <bool lighten> | |
| 364 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, | |
| 365 uint8x8_t sa, uint8x8_t da) { | |
| 366 uint16x8_t sd, ds, cmp, tmp, tmp2; | |
| 367 | |
| 368 // Prepare | |
| 369 sd = vmull_u8(sc, da); | |
| 370 ds = vmull_u8(dc, sa); | |
| 371 | |
| 372 // Do test | |
| 373 if (lighten) { | |
| 374 cmp = vcgtq_u16(sd, ds); | |
| 375 } else { | |
| 376 cmp = vcltq_u16(sd, ds); | |
| 377 } | |
| 378 | |
| 379 // Assign if | |
| 380 tmp = vaddl_u8(sc, dc); | |
| 381 tmp2 = tmp; | |
| 382 tmp -= SkDiv255Round_neon8_16_16(ds); | |
| 383 | |
| 384 // Calc else | |
| 385 tmp2 -= SkDiv255Round_neon8_16_16(sd); | |
| 386 | |
| 387 // Insert where needed | |
| 388 tmp = vbslq_u16(cmp, tmp, tmp2); | |
| 389 | |
| 390 return vmovn_u16(tmp); | |
| 391 } | |
| 392 | |
| 393 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, | |
| 394 uint8x8_t sa, uint8x8_t da) { | |
| 395 return lighten_darken_color<false>(sc, dc, sa, da); | |
| 396 } | |
| 397 | |
| 398 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 399 uint8x8x4_t ret; | |
| 400 | |
| 401 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 402 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], | |
| 403 src.val[NEON_A], dst.val[NEON_A]); | |
| 404 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], | |
| 405 src.val[NEON_A], dst.val[NEON_A]); | |
| 406 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], | |
| 407 src.val[NEON_A], dst.val[NEON_A]); | |
| 408 | |
| 409 return ret; | |
| 410 } | |
| 411 | |
| 412 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, | |
| 413 uint8x8_t sa, uint8x8_t da) { | |
| 414 return lighten_darken_color<true>(sc, dc, sa, da); | |
| 415 } | |
| 416 | |
| 417 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 418 uint8x8x4_t ret; | |
| 419 | |
| 420 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 421 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], | |
| 422 src.val[NEON_A], dst.val[NEON_A]); | |
| 423 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], | |
| 424 src.val[NEON_A], dst.val[NEON_A]); | |
| 425 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], | |
| 426 src.val[NEON_A], dst.val[NEON_A]); | |
| 427 | |
| 428 return ret; | |
| 429 } | |
| 430 | |
| 431 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, | |
| 432 uint8x8_t sa, uint8x8_t da) { | |
| 433 return overlay_hardlight_color<false>(sc, dc, sa, da); | |
| 434 } | |
| 435 | |
| 436 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 437 uint8x8x4_t ret; | |
| 438 | |
| 439 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 440 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], | |
| 441 src.val[NEON_A], dst.val[NEON_A]); | |
| 442 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], | |
| 443 src.val[NEON_A], dst.val[NEON_A]); | |
| 444 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], | |
| 445 src.val[NEON_A], dst.val[NEON_A]); | |
| 446 | |
| 447 return ret; | |
| 448 } | |
| 449 | |
| 450 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, | |
| 451 uint8x8_t sa, uint8x8_t da) { | |
| 452 uint16x8_t sd, ds, tmp; | |
| 453 int16x8_t val; | |
| 454 | |
| 455 sd = vmull_u8(sc, da); | |
| 456 ds = vmull_u8(dc, sa); | |
| 457 | |
| 458 tmp = vminq_u16(sd, ds); | |
| 459 tmp = SkDiv255Round_neon8_16_16(tmp); | |
| 460 tmp = vshlq_n_u16(tmp, 1); | |
| 461 | |
| 462 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); | |
| 463 | |
| 464 val -= vreinterpretq_s16_u16(tmp); | |
| 465 | |
| 466 val = vmaxq_s16(val, vdupq_n_s16(0)); | |
| 467 val = vminq_s16(val, vdupq_n_s16(255)); | |
| 468 | |
| 469 return vmovn_u16(vreinterpretq_u16_s16(val)); | |
| 470 } | |
| 471 | |
| 472 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 473 uint8x8x4_t ret; | |
| 474 | |
| 475 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 476 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], | |
| 477 src.val[NEON_A], dst.val[NEON_A]); | |
| 478 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], | |
| 479 src.val[NEON_A], dst.val[NEON_A]); | |
| 480 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], | |
| 481 src.val[NEON_A], dst.val[NEON_A]); | |
| 482 | |
| 483 return ret; | |
| 484 } | |
| 485 | |
| 486 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, | |
| 487 uint8x8_t sa, uint8x8_t da) { | |
| 488 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ | |
| 489 | |
| 490 uint16x8_t sc_plus_dc, scdc, const255; | |
| 491 int32x4_t term1_1, term1_2, term2_1, term2_2; | |
| 492 | |
| 493 /* Calc (sc + dc) and (sc * dc) */ | |
| 494 sc_plus_dc = vaddl_u8(sc, dc); | |
| 495 scdc = vmull_u8(sc, dc); | |
| 496 | |
| 497 /* Prepare constants */ | |
| 498 const255 = vdupq_n_u16(255); | |
| 499 | |
| 500 /* Calc the first term */ | |
| 501 term1_1 = vreinterpretq_s32_u32( | |
| 502 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | |
| 503 term1_2 = vreinterpretq_s32_u32( | |
| 504 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | |
| 505 | |
| 506 /* Calc the second term */ | |
| 507 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | |
| 508 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | |
| 509 | |
| 510 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | |
| 511 } | |
| 512 | |
| 513 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 514 uint8x8x4_t ret; | |
| 515 | |
| 516 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 517 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | |
| 518 src.val[NEON_A], dst.val[NEON_A]); | |
| 519 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | |
| 520 src.val[NEON_A], dst.val[NEON_A]); | |
| 521 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | |
| 522 src.val[NEON_A], dst.val[NEON_A]); | |
| 523 | |
| 524 return ret; | |
| 525 } | |
| 526 | |
| 527 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | |
| 528 uint8x8_t sa, uint8x8_t da) { | |
| 529 uint32x4_t val1, val2; | |
| 530 uint16x8_t scdc, t1, t2; | |
| 531 | |
| 532 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | |
| 533 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | |
| 534 scdc = vmull_u8(sc, dc); | |
| 535 | |
| 536 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | |
| 537 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | |
| 538 | |
| 539 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | |
| 540 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | |
| 541 | |
| 542 return clamp_div255round_simd8_32( | |
| 543 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | |
| 544 } | |
| 545 | |
| 546 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 547 uint8x8x4_t ret; | |
| 548 | |
| 549 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 550 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | |
| 551 src.val[NEON_A], dst.val[NEON_A]) ; | |
| 552 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], | |
| 553 src.val[NEON_A], dst.val[NEON_A]) ; | |
| 554 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], | |
| 555 src.val[NEON_A], dst.val[NEON_A]) ; | |
| 556 | |
| 557 return ret; | |
| 558 } | |
| 559 | |
| 560 //////////////////////////////////////////////////////////////////////////////// | |
| 561 | |
| 562 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); | |
| 563 | |
| 564 class SkNEONProcCoeffXfermode : public SkProcCoeffXfermode { | |
| 565 public: | |
| 566 SkNEONProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, | |
| 567 SkXfermodeProcSIMD procSIMD) | |
| 568 : INHERITED(rec, mode), fProcSIMD(procSIMD) {} | |
| 569 | |
| 570 virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count, | |
| 571 const SkAlpha aa[]) const SK_OVERRIDE; | |
| 572 | |
| 573 SK_DEVELOPER_TO_STRING() | |
| 574 SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode) | |
| 575 | |
| 576 private: | |
| 577 SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer) | |
| 578 : INHERITED(buffer) { | |
| 579 | |
| 580 fProcSIMD = NULL; | |
| 581 if (!buffer.isCrossProcess()) { | |
| 582 fProcSIMD = (SkXfermodeProcSIMD)buffer.readFunctionPtr(); | |
| 583 } | |
| 584 } | |
| 585 | |
| 586 virtual void flatten(SkFlattenableWriteBuffer& buffer) const SK_OVERRIDE; | |
| 587 | |
| 588 SkXfermodeProcSIMD fProcSIMD; | |
| 589 typedef SkProcCoeffXfermode INHERITED; | |
| 590 }; | |
| 591 | |
| 592 | |
| 593 void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], | |
| 594 int count, const SkAlpha aa[]) const { | |
| 595 SkASSERT(dst && src && count >= 0); | |
| 596 | |
| 597 SkXfermodeProc proc = this->getProc(); | |
| 598 SkXfermodeProcSIMD procSIMD = fProcSIMD; | |
| 599 | |
| 600 if (NULL == aa) { | |
| 601 // Unrolled NEON code | |
| 602 while (count >= 8) { | |
| 603 uint8x8x4_t vsrc, vdst, vres; | |
| 604 | |
| 605 asm volatile ( | |
| 606 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | |
| 607 "vld4.u8 %h[vdst], [%[dst]] \t\n" | |
| 608 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst) | |
| 609 : [src] "r" (src), [dst] "r" (dst) | |
| 610 : | |
| 611 ); | |
| 612 | |
| 613 vres = procSIMD(vsrc, vdst); | |
| 614 | |
| 615 vst4_u8((uint8_t*)dst, vres); | |
| 616 | |
| 617 count -= 8; | |
| 618 dst += 8; | |
| 619 } | |
| 620 // Leftovers | |
| 621 for (int i = 0; i < count; i++) { | |
| 622 dst[i] = proc(src[i], dst[i]); | |
| 623 } | |
| 624 } else { | |
| 625 for (int i = count - 1; i >= 0; --i) { | |
| 626 unsigned a = aa[i]; | |
| 627 if (0 != a) { | |
| 628 SkPMColor dstC = dst[i]; | |
| 629 SkPMColor C = proc(src[i], dstC); | |
| 630 if (a != 0xFF) { | |
| 631 C = SkFourByteInterp(C, dstC, a); | |
| 632 } | |
| 633 dst[i] = C; | |
| 634 } | |
| 635 } | |
| 636 } | |
| 637 } | |
| 638 | |
| 639 #ifdef SK_DEVELOPER | |
| 640 void SkNEONProcCoeffXfermode::toString(SkString* str) const { | |
| 641 this->INHERITED::toString(str); | |
| 642 } | |
| 643 #endif | |
| 644 | |
| 645 void SkNEONProcCoeffXfermode::flatten(SkFlattenableWriteBuffer& buffer) const { | |
| 646 this->INHERITED::flatten(buffer); | |
| 647 if (!buffer.isCrossProcess()) { | |
| 648 buffer.writeFunctionPtr((void*)fProcSIMD); | |
| 649 } | |
| 650 } | |
| 651 | |
| 652 //////////////////////////////////////////////////////////////////////////////// | |
| 653 | |
| 654 SkXfermodeProcSIMD gNEONXfermodeProcs[] = { | |
| 655 [SkXfermode::kClear_Mode] = NULL, | |
| 656 [SkXfermode::kSrc_Mode] = NULL, | |
| 657 [SkXfermode::kDst_Mode] = NULL, | |
| 658 [SkXfermode::kSrcOver_Mode] = NULL, | |
| 659 [SkXfermode::kDstOver_Mode] = dstover_modeproc_neon8, | |
| 660 [SkXfermode::kSrcIn_Mode] = srcin_modeproc_neon8, | |
| 661 [SkXfermode::kDstIn_Mode] = dstin_modeproc_neon8, | |
| 662 [SkXfermode::kSrcOut_Mode] = srcout_modeproc_neon8, | |
| 663 [SkXfermode::kDstOut_Mode] = dstout_modeproc_neon8, | |
| 664 [SkXfermode::kSrcATop_Mode] = srcatop_modeproc_neon8, | |
| 665 [SkXfermode::kDstATop_Mode] = dstatop_modeproc_neon8, | |
| 666 [SkXfermode::kXor_Mode] = xor_modeproc_neon8, | |
| 667 [SkXfermode::kPlus_Mode] = plus_modeproc_neon8, | |
| 668 [SkXfermode::kModulate_Mode]= modulate_modeproc_neon8, | |
| 669 [SkXfermode::kScreen_Mode] = screen_modeproc_neon8, | |
| 670 | |
| 671 [SkXfermode::kOverlay_Mode] = overlay_modeproc_neon8, | |
| 672 [SkXfermode::kDarken_Mode] = darken_modeproc_neon8, | |
| 673 [SkXfermode::kLighten_Mode] = lighten_modeproc_neon8, | |
| 674 [SkXfermode::kColorDodge_Mode] = NULL, | |
| 675 [SkXfermode::kColorBurn_Mode] = NULL, | |
| 676 [SkXfermode::kHardLight_Mode] = hardlight_modeproc_neon8, | |
| 677 [SkXfermode::kSoftLight_Mode] = NULL, | |
| 678 [SkXfermode::kDifference_Mode] = difference_modeproc_neon8, | |
| 679 [SkXfermode::kExclusion_Mode] = exclusion_modeproc_neon8, | |
| 680 [SkXfermode::kMultiply_Mode] = multiply_modeproc_neon8, | |
| 681 | |
| 682 [SkXfermode::kHue_Mode] = NULL, | |
| 683 [SkXfermode::kSaturation_Mode] = NULL, | |
| 684 [SkXfermode::kColor_Mode] = NULL, | |
| 685 [SkXfermode::kLuminosity_Mode] = NULL, | |
| 686 }; | |
| 687 | |
| 688 SK_COMPILE_ASSERT( | |
| 689 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, | |
| 690 mode_count_arm | |
| 691 ); | |
| 692 | |
| 693 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, | |
| 694 SkXfermode::Mode mode) { | |
| 695 #if SK_ARM_NEON_IS_DYNAMIC | |
| 696 if ((sk_cpu_arm_has_neon()) && (gNEONXfermodeProcs[mode] != NULL)) { | |
|
djsollen
2013/10/10 14:24:23
my concern with putting this here is that this fil
| |
| 697 #elif SK_ARM_NEON_IS_ALWAYS | |
| 698 if (gNEONXfermodeProcs[mode] != NULL) { | |
| 699 #endif | |
| 700 return SkNEW_ARGS(SkNEONProcCoeffXfermode, | |
| 701 (rec, mode, gNEONXfermodeProcs[mode])); | |
| 702 } | |
| 703 return NULL; | |
| 704 } | |
| OLD | NEW |