| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkXfermode.h" | 8 // Including Sk4pxXfermode.h from this file should find SK_ARM_HAS_NEON is defin
ed. |
| 9 #include "SkXfermode_proccoeff.h" | |
| 10 #include "SkColorPriv.h" | |
| 11 | |
| 12 #include <arm_neon.h> | |
| 13 #include "SkColor_opts_neon.h" | |
| 14 #include "SkXfermode_opts_arm_neon.h" | |
| 15 #include "Sk4pxXfermode.h" | 9 #include "Sk4pxXfermode.h" |
| 16 | 10 |
| 17 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 11 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& r, SkXfermode::Mode m); |
| 18 | 12 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& r, SkXfermode::Mode m) { |
| 19 | 13 return SkCreate4pxXfermode(r, m); |
| 20 //////////////////////////////////////////////////////////////////////////////// | |
| 21 // NEONized skia functions | |
| 22 //////////////////////////////////////////////////////////////////////////////// | |
| 23 | |
| 24 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) { | |
| 25 uint16x8_t tmp; | |
| 26 uint8x8_t ret; | |
| 27 | |
| 28 tmp = vmull_u8(color, alpha); | |
| 29 tmp = vaddq_u16(tmp, vdupq_n_u16(128)); | |
| 30 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); | |
| 31 | |
| 32 ret = vshrn_n_u16(tmp, 8); | |
| 33 | |
| 34 return ret; | |
| 35 } | 14 } |
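Note: the three vector steps above mirror Skia's scalar SkMulDiv255Round (which the SkAlphaMulAlpha macro in the OLD column expands to). A minimal scalar model of the trick, offered as a sketch rather than as part of the CL:

```cpp
#include <cstdint>

// Rounded division by 255, exact for any product of two bytes:
//   vmull_u8 + vaddq_u16(..., 128)      -> prod = color * alpha + 128
//   vaddq_u16(..., vshrq_n_u16(..., 8)) -> prod += prod >> 8
//   vshrn_n_u16(..., 8)                 -> result = prod >> 8
static uint8_t mul_div255_round(uint8_t color, uint8_t alpha) {
    uint32_t prod = color * alpha + 128;
    prod += prod >> 8;
    return static_cast<uint8_t>(prod >> 8);
}
```

For all inputs in [0, 255] this equals (color*alpha + 127) / 255, i.e. round-to-nearest; the NEON helpers simply apply it to eight lanes at once.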
| 36 | |
| 37 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) { | |
| 38 uint16x8_t ret; | |
| 39 | |
| 40 ret = vmull_u8(color, alpha); | |
| 41 ret = vaddq_u16(ret, vdupq_n_u16(128)); | |
| 42 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | |
| 43 | |
| 44 ret = vshrq_n_u16(ret, 8); | |
| 45 | |
| 46 return ret; | |
| 47 } | |
| 48 | |
| 49 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | |
| 50 uint16x8_t tmp; | |
| 51 | |
| 52 #ifdef SK_CPU_ARM64 | |
| 53 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
| 54 vreinterpretq_u32_s32(p2)); | |
| 55 #else | |
| 56 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
| 57 vmovn_u32(vreinterpretq_u32_s32(p2))); | |
| 58 #endif | |
| 59 | |
| 60 tmp += vdupq_n_u16(128); | |
| 61 tmp += vshrq_n_u16(tmp, 8); | |
| 62 | |
| 63 return vshrn_n_u16(tmp, 8); | |
| 64 } | |
| 65 | |
| 66 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | |
| 67 prod += vdupq_n_u16(128); | |
| 68 prod += vshrq_n_u16(prod, 8); | |
| 69 | |
| 70 return vshrq_n_u16(prod, 8); | |
| 71 } | |
| 72 | |
| 73 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { | |
| 74 uint8x8_t ret; | |
| 75 uint32x4_t cmp1, cmp2; | |
| 76 uint16x8_t cmp16; | |
| 77 uint8x8_t cmp8, cmp8_1; | |
| 78 | |
| 79 // Test if <= 0 | |
| 80 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | |
| 81 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | |
| 82 #ifdef SK_CPU_ARM64 | |
| 83 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
| 84 #else | |
| 85 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | |
| 86 #endif | |
| 87 cmp8_1 = vmovn_u16(cmp16); | |
| 88 | |
| 89 // Init to zero | |
| 90 ret = vdup_n_u8(0); | |
| 91 | |
| 92 // Test if >= 255*255 | |
| 93 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | |
| 94 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | |
| 95 #ifdef SK_CPU_ARM64 | |
| 96 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
| 97 #else | |
| 98 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | |
| 99 #endif | |
| 100 cmp8 = vmovn_u16(cmp16); | |
| 101 | |
| 102 // Insert 255 where true | |
| 103 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | |
| 104 | |
| 105 // Calc SkDiv255Round | |
| 106 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | |
| 107 | |
| 108 // Where neither test fired, select the divide result | |
| 109 cmp8 = cmp8 | cmp8_1; | |
| 110 ret = vbsl_u8(cmp8, ret, div); | |
| 111 | |
| 112 // Return the final combination | |
| 113 return ret; | |
| 114 } | |
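Note: this mirrors Skia's scalar clamp_div255round logic. A scalar model (a sketch, with a name of my own) of the three-way select that the two masks and vbsl_u8 build:

```cpp
#include <cstdint>

// Values at or below 0 clamp to 0; values at or above 255*255 clamp to 255;
// everything in between gets the rounded divide by 255. The NEON version
// computes both masks up front, then blends {0, 255, quotient} with vbsl_u8.
static uint8_t clamp_div255round_model(int32_t val) {
    if (val <= 0)         return 0;
    if (val >= 255 * 255) return 255;
    uint32_t p = static_cast<uint32_t>(val) + 128;
    return static_cast<uint8_t>((p + (p >> 8)) >> 8);
}
```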
| 115 | |
| 116 //////////////////////////////////////////////////////////////////////////////// | |
| 117 // 1-pixel modeprocs | |
| 118 //////////////////////////////////////////////////////////////////////////////// | |
| 119 | |
| 120 // kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc] | |
| 121 SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
| 122 unsigned sa = SkGetPackedA32(src); | |
| 123 unsigned da = SkGetPackedA32(dst); | |
| 124 unsigned isa = 255 - sa; | |
| 125 | |
| 126 uint8x8_t vda, visa, vsrc, vdst; | |
| 127 | |
| 128 vda = vdup_n_u8(da); | |
| 129 visa = vdup_n_u8(isa); | |
| 130 | |
| 131 uint16x8_t vsrc_wide, vdst_wide; | |
| 132 vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src))); | |
| 133 vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst))); | |
| 134 | |
| 135 vsrc_wide += vdupq_n_u16(128); | |
| 136 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); | |
| 137 | |
| 138 vdst_wide += vdupq_n_u16(128); | |
| 139 vdst_wide += vshrq_n_u16(vdst_wide, 8); | |
| 140 | |
| 141 vsrc = vshrn_n_u16(vsrc_wide, 8); | |
| 142 vdst = vshrn_n_u16(vdst_wide, 8); | |
| 143 | |
| 144 vsrc += vdst; | |
| 145 vsrc = vset_lane_u8(da, vsrc, 3); | |
| 146 | |
| 147 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
| 148 } | |
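For reference, a scalar sketch of the same blend (using the SkColorPriv.h helpers this file already includes) makes the lane arithmetic easier to follow; the vset_lane_u8(da, vsrc, 3) above is the vector counterpart of forcing the packed alpha to Da:

```cpp
#include "SkColorPriv.h"

// Scalar sketch of src-atop: [Da, Sc * Da + (1 - Sa) * Dc].
static SkPMColor srcatop_scalar(SkPMColor src, SkPMColor dst) {
    unsigned sa  = SkGetPackedA32(src);
    unsigned da  = SkGetPackedA32(dst);
    unsigned isa = 255 - sa;
    return SkPackARGB32(da,
                        SkMulDiv255Round(da, SkGetPackedR32(src)) +
                            SkMulDiv255Round(isa, SkGetPackedR32(dst)),
                        SkMulDiv255Round(da, SkGetPackedG32(src)) +
                            SkMulDiv255Round(isa, SkGetPackedG32(dst)),
                        SkMulDiv255Round(da, SkGetPackedB32(src)) +
                            SkMulDiv255Round(isa, SkGetPackedB32(dst)));
}
```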
| 149 | |
| 150 // kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)] | |
| 151 SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
| 152 unsigned sa = SkGetPackedA32(src); | |
| 153 unsigned da = SkGetPackedA32(dst); | |
| 154 unsigned ida = 255 - da; | |
| 155 | |
| 156 uint8x8_t vsa, vida, vsrc, vdst; | |
| 157 | |
| 158 vsa = vdup_n_u8(sa); | |
| 159 vida = vdup_n_u8(ida); | |
| 160 | |
| 161 uint16x8_t vsrc_wide, vdst_wide; | |
| 162 vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src))); | |
| 163 vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst))); | |
| 164 | |
| 165 vsrc_wide += vdupq_n_u16(128); | |
| 166 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); | |
| 167 | |
| 168 vdst_wide += vdupq_n_u16(128); | |
| 169 vdst_wide += vshrq_n_u16(vdst_wide, 8); | |
| 170 | |
| 171 vsrc = vshrn_n_u16(vsrc_wide, 8); | |
| 172 vdst = vshrn_n_u16(vdst_wide, 8); | |
| 173 | |
| 174 vsrc += vdst; | |
| 175 vsrc = vset_lane_u8(sa, vsrc, 3); | |
| 176 | |
| 177 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
| 178 } | |
| 179 | |
| 180 // kXor_Mode [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc] | |
| 181 SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
| 182 unsigned sa = SkGetPackedA32(src); | |
| 183 unsigned da = SkGetPackedA32(dst); | |
| 184 unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1); | |
| 185 unsigned isa = 255 - sa; | |
| 186 unsigned ida = 255 - da; | |
| 187 | |
| 188 uint8x8_t vsrc, vdst, visa, vida; | |
| 189 uint16x8_t vsrc_wide, vdst_wide; | |
| 190 | |
| 191 visa = vdup_n_u8(isa); | |
| 192 vida = vdup_n_u8(ida); | |
| 193 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); | |
| 194 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); | |
| 195 | |
| 196 vsrc_wide = vmull_u8(vsrc, vida); | |
| 197 vdst_wide = vmull_u8(vdst, visa); | |
| 198 | |
| 199 vsrc_wide += vdupq_n_u16(128); | |
| 200 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); | |
| 201 | |
| 202 vdst_wide += vdupq_n_u16(128); | |
| 203 vdst_wide += vshrq_n_u16(vdst_wide, 8); | |
| 204 | |
| 205 vsrc = vshrn_n_u16(vsrc_wide, 8); | |
| 206 vdst = vshrn_n_u16(vdst_wide, 8); | |
| 207 | |
| 208 vsrc += vdst; | |
| 209 | |
| 210 vsrc = vset_lane_u8(ret_alpha, vsrc, 3); | |
| 211 | |
| 212 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
| 213 } | |
| 214 | |
| 215 // kPlus_Mode | |
| 216 SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
| 217 uint8x8_t vsrc, vdst; | |
| 218 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); | |
| 219 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); | |
| 220 vsrc = vqadd_u8(vsrc, vdst); | |
| 221 | |
| 222 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); | |
| 223 } | |
| 224 | |
| 225 // kModulate_Mode | |
| 226 SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) { | |
| 227 uint8x8_t vsrc, vdst, vres; | |
| 228 uint16x8_t vres_wide; | |
| 229 | |
| 230 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); | |
| 231 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); | |
| 232 | |
| 233 vres_wide = vmull_u8(vsrc, vdst); | |
| 234 | |
| 235 vres_wide += vdupq_n_u16(128); | |
| 236 vres_wide += vshrq_n_u16(vres_wide, 8); | |
| 237 | |
| 238 vres = vshrn_n_u16(vres_wide, 8); | |
| 239 | |
| 240 return vget_lane_u32(vreinterpret_u32_u8(vres), 0); | |
| 241 } | |
| 242 | |
| 243 //////////////////////////////////////////////////////////////////////////////// | |
| 244 // 8-pixel modeprocs | |
| 245 //////////////////////////////////////////////////////////////////////////////// | |
| 246 | |
| 247 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 248 uint8x8x4_t ret; | |
| 249 uint16x8_t src_scale; | |
| 250 | |
| 251 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); | |
| 252 | |
| 253 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale); | |
| 254 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale); | |
| 255 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale); | |
| 256 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale); | |
| 257 | |
| 258 return ret; | |
| 259 } | |
| 260 | |
| 261 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 262 uint8x8x4_t ret; | |
| 263 uint16x8_t scale; | |
| 264 | |
| 265 scale = SkAlpha255To256_neon8(dst.val[NEON_A]); | |
| 266 | |
| 267 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); | |
| 268 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); | |
| 269 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); | |
| 270 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); | |
| 271 | |
| 272 return ret; | |
| 273 } | |
| 274 | |
| 275 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 276 uint8x8x4_t ret; | |
| 277 uint16x8_t scale; | |
| 278 | |
| 279 scale = SkAlpha255To256_neon8(src.val[NEON_A]); | |
| 280 | |
| 281 ret = SkAlphaMulQ_neon8(dst, scale); | |
| 282 | |
| 283 return ret; | |
| 284 } | |
| 285 | |
| 286 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 287 uint8x8x4_t ret; | |
| 288 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); | |
| 289 | |
| 290 ret = SkAlphaMulQ_neon8(src, scale); | |
| 291 | |
| 292 return ret; | |
| 293 } | |
| 294 | |
| 295 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 296 uint8x8x4_t ret; | |
| 297 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); | |
| 298 | |
| 299 ret = SkAlphaMulQ_neon8(dst, scale); | |
| 300 | |
| 301 return ret; | |
| 302 } | |
| 303 | |
| 304 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 305 uint8x8x4_t ret; | |
| 306 uint8x8_t isa; | |
| 307 | |
| 308 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); | |
| 309 | |
| 310 ret.val[NEON_A] = dst.val[NEON_A]; | |
| 311 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) | |
| 312 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); | |
| 313 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) | |
| 314 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); | |
| 315 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) | |
| 316 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); | |
| 317 | |
| 318 return ret; | |
| 319 } | |
| 320 | |
| 321 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 322 uint8x8x4_t ret; | |
| 323 uint8x8_t ida; | |
| 324 | |
| 325 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); | |
| 326 | |
| 327 ret.val[NEON_A] = src.val[NEON_A]; | |
| 328 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) | |
| 329 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); | |
| 330 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) | |
| 331 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); | |
| 332 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) | |
| 333 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); | |
| 334 | |
| 335 return ret; | |
| 336 } | |
| 337 | |
| 338 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 339 uint8x8x4_t ret; | |
| 340 uint8x8_t isa, ida; | |
| 341 uint16x8_t tmp_wide, tmp_wide2; | |
| 342 | |
| 343 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); | |
| 344 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); | |
| 345 | |
| 346 // First calc alpha | |
| 347 tmp_wide = vmovl_u8(src.val[NEON_A]); | |
| 348 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); | |
| 349 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1); | |
| 350 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); | |
| 351 ret.val[NEON_A] = vmovn_u16(tmp_wide); | |
| 352 | |
| 353 // Then colors | |
| 354 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) | |
| 355 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); | |
| 356 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) | |
| 357 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); | |
| 358 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) | |
| 359 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); | |
| 360 | |
| 361 return ret; | |
| 362 } | |
| 363 | |
| 364 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 365 uint8x8x4_t ret; | |
| 366 | |
| 367 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); | |
| 368 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); | |
| 369 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); | |
| 370 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); | |
| 371 | |
| 372 return ret; | |
| 373 } | |
| 374 | |
| 375 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 376 uint8x8x4_t ret; | |
| 377 | |
| 378 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); | |
| 379 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); | |
| 380 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); | |
| 381 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); | |
| 382 | |
| 383 return ret; | |
| 384 } | |
| 385 | |
| 386 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { | |
| 387 uint16x8_t tmp; | |
| 388 | |
| 389 tmp = vaddl_u8(a, b); | |
| 390 tmp -= SkAlphaMulAlpha_neon8_16(a, b); | |
| 391 | |
| 392 return vmovn_u16(tmp); | |
| 393 } | |
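Per lane this computes a + b - round(a*b/255): the screen blend for a color channel and, applied to the alpha lanes, the standard Sa + Da - Sa*Da/255 combine that every advanced mode below reuses for its alpha channel. A scalar sketch:

```cpp
#include <cstdint>

// a + b <= 510 fits the 16-bit intermediate, and the result never exceeds 255.
static uint8_t srcover_byte(uint8_t a, uint8_t b) {
    uint32_t prod = a * b + 128;
    return static_cast<uint8_t>(a + b - ((prod + (prod >> 8)) >> 8));
}
```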
| 394 | |
| 395 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 396 uint8x8x4_t ret; | |
| 397 | |
| 398 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 399 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); | |
| 400 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); | |
| 401 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); | |
| 402 | |
| 403 return ret; | |
| 404 } | |
| 405 | |
| 406 template <bool overlay> | |
| 407 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, | |
| 408 uint8x8_t sa, uint8x8_t da) { | |
| 409 /* | |
| 410 * In the end we compute (rc + tmp), where rc differs | |
| 411 * between the two branches. | |
| 412 * The whole value (rc + tmp) can always be expressed as | |
| 413 * VAL = COM - SUB in the if case | |
| 414 * VAL = COM + SUB - sa*da in the else case | |
| 415 * | |
| 416 * with COM = 255 * (sc + dc) | |
| 417 * and SUB = sc*da + dc*sa - 2*dc*sc | |
| 418 */ | |
| 419 | |
| 420 // Prepare common subexpressions | |
| 421 uint16x8_t const255 = vdupq_n_u16(255); | |
| 422 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); | |
| 423 uint16x8_t scda = vmull_u8(sc, da); | |
| 424 uint16x8_t dcsa = vmull_u8(dc, sa); | |
| 425 uint16x8_t sada = vmull_u8(sa, da); | |
| 426 | |
| 427 // Prepare non-common subexpressions | |
| 428 uint16x8_t dc2, sc2; | |
| 429 uint32x4_t scdc2_1, scdc2_2; | |
| 430 if (overlay) { | |
| 431 dc2 = vshll_n_u8(dc, 1); | |
| 432 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | |
| 433 #ifdef SK_CPU_ARM64 | |
| 434 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); | |
| 435 #else | |
| 436 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | |
| 437 #endif | |
| 438 } else { | |
| 439 sc2 = vshll_n_u8(sc, 1); | |
| 440 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | |
| 441 #ifdef SK_CPU_ARM64 | |
| 442 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); | |
| 443 #else | |
| 444 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | |
| 445 #endif | |
| 446 } | |
| 447 | |
| 448 // Calc COM | |
| 449 int32x4_t com1, com2; | |
| 450 com1 = vreinterpretq_s32_u32( | |
| 451 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | |
| 452 com2 = vreinterpretq_s32_u32( | |
| 453 #ifdef SK_CPU_ARM64 | |
| 454 vmull_high_u16(const255, sc_plus_dc)); | |
| 455 #else | |
| 456 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | |
| 457 #endif | |
| 458 | |
| 459 // Calc SUB | |
| 460 int32x4_t sub1, sub2; | |
| 461 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); | |
| 462 #ifdef SK_CPU_ARM64 | |
| 463 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); | |
| 464 #else | |
| 465 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); | |
| 466 #endif | |
| 467 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | |
| 468 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | |
| 469 | |
| 470 // Compare 2*dc <= da (overlay) or 2*sc <= sa (hardlight) | |
| 471 uint16x8_t cmp; | |
| 472 | |
| 473 if (overlay) { | |
| 474 cmp = vcleq_u16(dc2, vmovl_u8(da)); | |
| 475 } else { | |
| 476 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | |
| 477 } | |
| 478 | |
| 479 // Prepare variables | |
| 480 int32x4_t val1_1, val1_2; | |
| 481 int32x4_t val2_1, val2_2; | |
| 482 uint32x4_t cmp1, cmp2; | |
| 483 | |
| 484 // Doing a signed lengthening allows us to save a few instructions | |
| 485 // thanks to sign extension. | |
| 486 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); | |
| 487 #ifdef SK_CPU_ARM64 | |
| 488 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); | |
| 489 #else | |
| 490 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); | |
| 491 #endif | |
| 492 | |
| 493 // Calc COM - SUB | |
| 494 val1_1 = com1 - sub1; | |
| 495 val1_2 = com2 - sub2; | |
| 496 | |
| 497 // Calc COM + SUB - sa*da | |
| 498 val2_1 = com1 + sub1; | |
| 499 val2_2 = com2 + sub2; | |
| 500 | |
| 501 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); | |
| 502 #ifdef SK_CPU_ARM64 | |
| 503 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); | |
| 504 #else | |
| 505 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); | |
| 506 #endif | |
| 507 | |
| 508 // Insert where needed | |
| 509 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | |
| 510 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | |
| 511 | |
| 512 // Call the clamp_div255round function | |
| 513 return clamp_div255round_simd8_32(val1_1, val1_2); | |
| 514 } | |
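The comment's algebra checks out: with tmp = sc*(255 - da) + dc*(255 - sa), the overlay branch 2*dc <= da gives rc + tmp = 2*sc*dc + tmp = 255*(sc + dc) - (sc*da + dc*sa - 2*sc*dc) = COM - SUB, while the other branch, rc = sa*da - 2*(da - dc)*(sa - sc), expands to COM + SUB - sa*da. A scalar sketch of the overlay case, reusing the clamp_div255round_model sketched earlier (hardlight is the same computation with the comparison 2*sc <= sa and the doubled product taken from the source side):

```cpp
// Scalar sketch of one overlay channel.
static uint8_t overlay_byte(int sc, int dc, int sa, int da) {
    int tmp = sc * (255 - da) + dc * (255 - sa);
    int rc  = (2 * dc <= da) ? 2 * sc * dc
                             : sa * da - 2 * (da - dc) * (sa - sc);
    return clamp_div255round_model(rc + tmp);
}
```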
| 515 | |
| 516 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | |
| 517 uint8x8_t sa, uint8x8_t da) { | |
| 518 return overlay_hardlight_color<true>(sc, dc, sa, da); | |
| 519 } | |
| 520 | |
| 521 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 522 uint8x8x4_t ret; | |
| 523 | |
| 524 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 525 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], | |
| 526 src.val[NEON_A], dst.val[NEON_A]); | |
| 527 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], | |
| 528 src.val[NEON_A], dst.val[NEON_A]); | |
| 529 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], | |
| 530 src.val[NEON_A], dst.val[NEON_A]); | |
| 531 | |
| 532 return ret; | |
| 533 } | |
| 534 | |
| 535 template <bool lighten> | |
| 536 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, | |
| 537 uint8x8_t sa, uint8x8_t da) { | |
| 538 uint16x8_t sd, ds, cmp, tmp, tmp2; | |
| 539 | |
| 540 // Prepare | |
| 541 sd = vmull_u8(sc, da); | |
| 542 ds = vmull_u8(dc, sa); | |
| 543 | |
| 544 // Do test | |
| 545 if (lighten) { | |
| 546 cmp = vcgtq_u16(sd, ds); | |
| 547 } else { | |
| 548 cmp = vcltq_u16(sd, ds); | |
| 549 } | |
| 550 | |
| 551 // Compute the if-branch value | |
| 552 tmp = vaddl_u8(sc, dc); | |
| 553 tmp2 = tmp; | |
| 554 tmp -= SkDiv255Round_neon8_16_16(ds); | |
| 555 | |
| 556 // Compute the else-branch value | |
| 557 tmp2 -= SkDiv255Round_neon8_16_16(sd); | |
| 558 | |
| 559 // Insert where needed | |
| 560 tmp = vbslq_u16(cmp, tmp, tmp2); | |
| 561 | |
| 562 return vmovn_u16(tmp); | |
| 563 } | |
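In scalar terms the template computes sc + dc minus the rounded divide of one of the two products, chosen by the comparison; a sketch:

```cpp
#include <cstdint>

// Both branches are sc + dc - round(prod/255); only the subtracted product
// differs. The result stays within a byte for premultiplied inputs.
static uint8_t lighten_darken_byte(bool lighten, int sc, int dc, int sa, int da) {
    int sd = sc * da;
    int ds = dc * sa;
    int prod = (lighten ? (sd > ds) : (sd < ds)) ? ds : sd;
    uint32_t p = static_cast<uint32_t>(prod) + 128;
    return static_cast<uint8_t>(sc + dc - ((p + (p >> 8)) >> 8));
}
```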
| 564 | |
| 565 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, | |
| 566 uint8x8_t sa, uint8x8_t da) { | |
| 567 return lighten_darken_color<false>(sc, dc, sa, da); | |
| 568 } | |
| 569 | |
| 570 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 571 uint8x8x4_t ret; | |
| 572 | |
| 573 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 574 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], | |
| 575 src.val[NEON_A], dst.val[NEON_A]); | |
| 576 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], | |
| 577 src.val[NEON_A], dst.val[NEON_A]); | |
| 578 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], | |
| 579 src.val[NEON_A], dst.val[NEON_A]); | |
| 580 | |
| 581 return ret; | |
| 582 } | |
| 583 | |
| 584 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, | |
| 585 uint8x8_t sa, uint8x8_t da) { | |
| 586 return lighten_darken_color<true>(sc, dc, sa, da); | |
| 587 } | |
| 588 | |
| 589 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 590 uint8x8x4_t ret; | |
| 591 | |
| 592 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 593 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], | |
| 594 src.val[NEON_A], dst.val[NEON_A]); | |
| 595 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], | |
| 596 src.val[NEON_A], dst.val[NEON_A]); | |
| 597 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], | |
| 598 src.val[NEON_A], dst.val[NEON_A]); | |
| 599 | |
| 600 return ret; | |
| 601 } | |
| 602 | |
| 603 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, | |
| 604 uint8x8_t sa, uint8x8_t da) { | |
| 605 return overlay_hardlight_color<false>(sc, dc, sa, da); | |
| 606 } | |
| 607 | |
| 608 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 609 uint8x8x4_t ret; | |
| 610 | |
| 611 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 612 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], | |
| 613 src.val[NEON_A], dst.val[NEON_A]); | |
| 614 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], | |
| 615 src.val[NEON_A], dst.val[NEON_A]); | |
| 616 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], | |
| 617 src.val[NEON_A], dst.val[NEON_A]); | |
| 618 | |
| 619 return ret; | |
| 620 } | |
| 621 | |
| 622 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, | |
| 623 uint8x8_t sa, uint8x8_t da) { | |
| 624 uint16x8_t sd, ds, tmp; | |
| 625 int16x8_t val; | |
| 626 | |
| 627 sd = vmull_u8(sc, da); | |
| 628 ds = vmull_u8(dc, sa); | |
| 629 | |
| 630 tmp = vminq_u16(sd, ds); | |
| 631 tmp = SkDiv255Round_neon8_16_16(tmp); | |
| 632 tmp = vshlq_n_u16(tmp, 1); | |
| 633 | |
| 634 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); | |
| 635 | |
| 636 val -= vreinterpretq_s16_u16(tmp); | |
| 637 | |
| 638 val = vmaxq_s16(val, vdupq_n_s16(0)); | |
| 639 val = vminq_s16(val, vdupq_n_s16(255)); | |
| 640 | |
| 641 return vmovn_u16(vreinterpretq_u16_s16(val)); | |
| 642 } | |
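A scalar sketch of the same computation; the final vmaxq_s16/vminq_s16 pair corresponds to the clamp:

```cpp
#include <algorithm>
#include <cstdint>

// sc + dc - 2 * round(min(sc*da, dc*sa) / 255), clamped to [0, 255].
static uint8_t difference_byte(int sc, int dc, int sa, int da) {
    uint32_t p = static_cast<uint32_t>(std::min(sc * da, dc * sa)) + 128;
    int v = sc + dc - 2 * static_cast<int>((p + (p >> 8)) >> 8);
    return static_cast<uint8_t>(std::max(0, std::min(255, v)));
}
```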
| 643 | |
| 644 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 645 uint8x8x4_t ret; | |
| 646 | |
| 647 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 648 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], | |
| 649 src.val[NEON_A], dst.val[NEON_A]); | |
| 650 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], | |
| 651 src.val[NEON_A], dst.val[NEON_A]); | |
| 652 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], | |
| 653 src.val[NEON_A], dst.val[NEON_A]); | |
| 654 | |
| 655 return ret; | |
| 656 } | |
| 657 | |
| 658 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, | |
| 659 uint8x8_t sa, uint8x8_t da) { | |
| 660 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ | |
| 661 | |
| 662 uint16x8_t sc_plus_dc, scdc, const255; | |
| 663 int32x4_t term1_1, term1_2, term2_1, term2_2; | |
| 664 | |
| 665 /* Calc (sc + dc) and (sc * dc) */ | |
| 666 sc_plus_dc = vaddl_u8(sc, dc); | |
| 667 scdc = vmull_u8(sc, dc); | |
| 668 | |
| 669 /* Prepare constants */ | |
| 670 const255 = vdupq_n_u16(255); | |
| 671 | |
| 672 /* Calc the first term */ | |
| 673 term1_1 = vreinterpretq_s32_u32( | |
| 674 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | |
| 675 term1_2 = vreinterpretq_s32_u32( | |
| 676 #ifdef SK_CPU_ARM64 | |
| 677 vmull_high_u16(const255, sc_plus_dc)); | |
| 678 #else | |
| 679 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | |
| 680 #endif | |
| 681 | |
| 682 /* Calc the second term */ | |
| 683 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | |
| 684 #ifdef SK_CPU_ARM64 | |
| 685 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); | |
| 686 #else | |
| 687 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | |
| 688 #endif | |
| 689 | |
| 690 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | |
| 691 } | |
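The simplification quoted in the comment follows by expanding exclusion's full term: sc*da + dc*sa - 2*sc*dc + sc*(255 - da) + dc*(255 - sa) = 255*sc + 255*dc - 2*sc*dc, since the sc*da and dc*sa products cancel. term1 thus computes 255*(sc + dc), term2 the doubled product, and their difference feeds the clamped rounding divide; a one-line scalar sketch reusing the earlier model:

```cpp
// Scalar sketch of one exclusion channel; sa/da drop out of the simplified form.
static uint8_t exclusion_byte(int sc, int dc) {
    return clamp_div255round_model(255 * (sc + dc) - 2 * sc * dc);
}
```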
| 692 | |
| 693 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 694 uint8x8x4_t ret; | |
| 695 | |
| 696 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 697 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | |
| 698 src.val[NEON_A], dst.val[NEON_A]); | |
| 699 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | |
| 700 src.val[NEON_A], dst.val[NEON_A]); | |
| 701 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | |
| 702 src.val[NEON_A], dst.val[NEON_A]); | |
| 703 | |
| 704 return ret; | |
| 705 } | |
| 706 | |
| 707 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | |
| 708 uint8x8_t sa, uint8x8_t da) { | |
| 709 uint32x4_t val1, val2; | |
| 710 uint16x8_t scdc, t1, t2; | |
| 711 | |
| 712 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | |
| 713 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | |
| 714 scdc = vmull_u8(sc, dc); | |
| 715 | |
| 716 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | |
| 717 #ifdef SK_CPU_ARM64 | |
| 718 val2 = vaddl_high_u16(t1, t2); | |
| 719 #else | |
| 720 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | |
| 721 #endif | |
| 722 | |
| 723 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | |
| 724 #ifdef SK_CPU_ARM64 | |
| 725 val2 = vaddw_high_u16(val2, scdc); | |
| 726 #else | |
| 727 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | |
| 728 #endif | |
| 729 | |
| 730 return clamp_div255round_simd8_32( | |
| 731 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | |
| 732 } | |
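val1/val2 accumulate exactly the three products of the multiply blend before the clamped divide; a scalar sketch, again reusing the earlier model:

```cpp
// Scalar sketch of one multiply channel:
// sc*(255 - da) + dc*(255 - sa) + sc*dc, then the clamped rounding divide.
static uint8_t multiply_byte(int sc, int dc, int sa, int da) {
    return clamp_div255round_model(sc * (255 - da) + dc * (255 - sa) + sc * dc);
}
```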
| 733 | |
| 734 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | |
| 735 uint8x8x4_t ret; | |
| 736 | |
| 737 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | |
| 738 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | |
| 739 src.val[NEON_A], dst.val[NEON_A]); | |
| 740 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], | |
| 741 src.val[NEON_A], dst.val[NEON_A]); | |
| 742 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], | |
| 743 src.val[NEON_A], dst.val[NEON_A]); | |
| 744 | |
| 745 return ret; | |
| 746 } | |
| 747 | |
| 748 //////////////////////////////////////////////////////////////////////////////// | |
| 749 | |
| 750 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); | |
| 751 | |
| 752 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; | |
| 753 | |
| 754 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst, | |
| 755 const SkPMColor* SK_RESTRICT src, int count, | |
| 756 const SkAlpha* SK_RESTRICT aa) const { | |
| 757 SkASSERT(dst && src && count >= 0); | |
| 758 | |
| 759 SkXfermodeProc proc = this->getProc(); | |
| 760 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | |
| 761 SkASSERT(procSIMD != NULL); | |
| 762 | |
| 763 if (NULL == aa) { | |
| 764 // Unrolled NEON code | |
| 765 // We'd like to just do this (modulo a few casts): | |
| 766 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst))); | |
| 767 // src += 8; | |
| 768 // dst += 8; | |
| 769 // but that tends to generate miserable code. Here are a bunch of faster | |
| 770 // workarounds for different architectures and compilers. | |
| 771 while (count >= 8) { | |
| 772 | |
| 773 #ifdef SK_CPU_ARM32 | |
| 774 uint8x8x4_t vsrc, vdst, vres; | |
| 775 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
| 776 asm volatile ( | |
| 777 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | |
| 778 "vld4.u8 %h[vdst], [%[dst]] \t\n" | |
| 779 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | |
| 780 : [dst] "r" (dst) | |
| 781 : | |
| 782 ); | |
| 783 #else | |
| 784 register uint8x8_t d0 asm("d0"); | |
| 785 register uint8x8_t d1 asm("d1"); | |
| 786 register uint8x8_t d2 asm("d2"); | |
| 787 register uint8x8_t d3 asm("d3"); | |
| 788 register uint8x8_t d4 asm("d4"); | |
| 789 register uint8x8_t d5 asm("d5"); | |
| 790 register uint8x8_t d6 asm("d6"); | |
| 791 register uint8x8_t d7 asm("d7"); | |
| 792 | |
| 793 asm volatile ( | |
| 794 "vld4.u8 {d0-d3},[%[src]]!;" | |
| 795 "vld4.u8 {d4-d7},[%[dst]];" | |
| 796 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | |
| 797 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | |
| 798 [src] "+&r" (src) | |
| 799 : [dst] "r" (dst) | |
| 800 : | |
| 801 ); | |
| 802 vsrc.val[0] = d0; vdst.val[0] = d4; | |
| 803 vsrc.val[1] = d1; vdst.val[1] = d5; | |
| 804 vsrc.val[2] = d2; vdst.val[2] = d6; | |
| 805 vsrc.val[3] = d3; vdst.val[3] = d7; | |
| 806 #endif | |
| 807 | |
| 808 vres = procSIMD(vsrc, vdst); | |
| 809 | |
| 810 vst4_u8((uint8_t*)dst, vres); | |
| 811 | |
| 812 dst += 8; | |
| 813 | |
| 814 #else // #ifdef SK_CPU_ARM32 | |
| 815 | |
| 816 asm volatile ( | |
| 817 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
| 818 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n" | |
| 819 "blr %[proc] \t\n" | |
| 820 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n" | |
| 821 : [src] "+&r" (src), [dst] "+&r" (dst) | |
| 822 : [proc] "r" (procSIMD) | |
| 823 : "cc", "memory", | |
| 824 /* We don't know what proc is going to clobber so we must | |
| 825 * add everything that is not callee-saved. | |
| 826 */ | |
| 827 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", | |
| 828 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", | |
| 829 "x30", /* x30 implicitly clobbered by blr */ | |
| 830 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | |
| 831 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", | |
| 832 "v27", "v28", "v29", "v30", "v31" | |
| 833 ); | |
| 834 | |
| 835 #endif // #ifdef SK_CPU_ARM32 | |
| 836 | |
| 837 count -= 8; | |
| 838 } | |
| 839 // Leftovers | |
| 840 for (int i = 0; i < count; i++) { | |
| 841 dst[i] = proc(src[i], dst[i]); | |
| 842 } | |
| 843 } else { | |
| 844 for (int i = count - 1; i >= 0; --i) { | |
| 845 unsigned a = aa[i]; | |
| 846 if (0 != a) { | |
| 847 SkPMColor dstC = dst[i]; | |
| 848 SkPMColor C = proc(src[i], dstC); | |
| 849 if (a != 0xFF) { | |
| 850 C = SkFourByteInterp_neon(C, dstC, a); | |
| 851 } | |
| 852 dst[i] = C; | |
| 853 } | |
| 854 } | |
| 855 } | |
| 856 } | |
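For comparison, the straightforward intrinsics loop that the comment above says generates miserable code would look like this sketch (the xfer32_plain name is mine; the asm blocks above are the workarounds actually shipped):

```cpp
#include <arm_neon.h>
#include <cstdint>

typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);

static void xfer32_plain(uint32_t* dst, const uint32_t* src, int count,
                         SkXfermodeProcSIMD procSIMD) {
    while (count >= 8) {
        // Deinterleave 8 ARGB pixels into one register per channel, blend,
        // and reinterleave on the store.
        uint8x8x4_t vsrc = vld4_u8(reinterpret_cast<const uint8_t*>(src));
        uint8x8x4_t vdst = vld4_u8(reinterpret_cast<uint8_t*>(dst));
        vst4_u8(reinterpret_cast<uint8_t*>(dst), procSIMD(vsrc, vdst));
        src += 8;
        dst += 8;
        count -= 8;
    }
}
```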
| 857 | |
| 858 void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, | |
| 859 const SkPMColor* SK_RESTRICT src, int count, | |
| 860 const SkAlpha* SK_RESTRICT aa) const { | |
| 861 SkASSERT(dst && src && count >= 0); | |
| 862 | |
| 863 SkXfermodeProc proc = this->getProc(); | |
| 864 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | |
| 865 SkASSERT(procSIMD != NULL); | |
| 866 | |
| 867 if (NULL == aa) { | |
| 868 while(count >= 8) { | |
| 869 uint16x8_t vdst, vres16; | |
| 870 uint8x8x4_t vdst32, vsrc, vres; | |
| 871 | |
| 872 vdst = vld1q_u16(dst); | |
| 873 | |
| 874 #ifdef SK_CPU_ARM64 | |
| 875 vsrc = vld4_u8((uint8_t*)src); | |
| 876 #else | |
| 877 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
| 878 asm volatile ( | |
| 879 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | |
| 880 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | |
| 881 : : | |
| 882 ); | |
| 883 #else | |
| 884 register uint8x8_t d0 asm("d0"); | |
| 885 register uint8x8_t d1 asm("d1"); | |
| 886 register uint8x8_t d2 asm("d2"); | |
| 887 register uint8x8_t d3 asm("d3"); | |
| 888 | |
| 889 asm volatile ( | |
| 890 "vld4.u8 {d0-d3},[%[src]]!;" | |
| 891 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | |
| 892 [src] "+&r" (src) | |
| 893 : : | |
| 894 ); | |
| 895 vsrc.val[0] = d0; | |
| 896 vsrc.val[1] = d1; | |
| 897 vsrc.val[2] = d2; | |
| 898 vsrc.val[3] = d3; | |
| 899 #endif | |
| 900 #endif // #ifdef SK_CPU_ARM64 | |
| 901 | |
| 902 vdst32 = SkPixel16ToPixel32_neon8(vdst); | |
| 903 vres = procSIMD(vsrc, vdst32); | |
| 904 vres16 = SkPixel32ToPixel16_neon8(vres); | |
| 905 | |
| 906 vst1q_u16(dst, vres16); | |
| 907 | |
| 908 count -= 8; | |
| 909 dst += 8; | |
| 910 #ifdef SK_CPU_ARM64 | |
| 911 src += 8; | |
| 912 #endif | |
| 913 } | |
| 914 for (int i = 0; i < count; i++) { | |
| 915 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | |
| 916 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); | |
| 917 } | |
| 918 } else { | |
| 919 for (int i = count - 1; i >= 0; --i) { | |
| 920 unsigned a = aa[i]; | |
| 921 if (0 != a) { | |
| 922 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | |
| 923 SkPMColor C = proc(src[i], dstC); | |
| 924 if (0xFF != a) { | |
| 925 C = SkFourByteInterp_neon(C, dstC, a); | |
| 926 } | |
| 927 dst[i] = SkPixel32ToPixel16_ToU16(C); | |
| 928 } | |
| 929 } | |
| 930 } | |
| 931 } | |
| 932 | |
| 933 #ifndef SK_IGNORE_TO_STRING | |
| 934 void SkNEONProcCoeffXfermode::toString(SkString* str) const { | |
| 935 this->INHERITED::toString(str); | |
| 936 } | |
| 937 #endif | |
| 938 | |
| 939 //////////////////////////////////////////////////////////////////////////////// | |
| 940 | |
| 941 SkXfermodeProcSIMD gNEONXfermodeProcs[] = { | |
| 942 NULL, // kClear_Mode | |
| 943 NULL, // kSrc_Mode | |
| 944 NULL, // kDst_Mode | |
| 945 NULL, // kSrcOver_Mode | |
| 946 dstover_modeproc_neon8, | |
| 947 srcin_modeproc_neon8, | |
| 948 dstin_modeproc_neon8, | |
| 949 srcout_modeproc_neon8, | |
| 950 dstout_modeproc_neon8, | |
| 951 srcatop_modeproc_neon8, | |
| 952 dstatop_modeproc_neon8, | |
| 953 xor_modeproc_neon8, | |
| 954 plus_modeproc_neon8, | |
| 955 modulate_modeproc_neon8, | |
| 956 screen_modeproc_neon8, | |
| 957 | |
| 958 overlay_modeproc_neon8, | |
| 959 darken_modeproc_neon8, | |
| 960 lighten_modeproc_neon8, | |
| 961 NULL, // kColorDodge_Mode | |
| 962 NULL, // kColorBurn_Mode | |
| 963 hardlight_modeproc_neon8, | |
| 964 NULL, // kSoftLight_Mode | |
| 965 difference_modeproc_neon8, | |
| 966 exclusion_modeproc_neon8, | |
| 967 multiply_modeproc_neon8, | |
| 968 | |
| 969 NULL, // kHue_Mode | |
| 970 NULL, // kSaturation_Mode | |
| 971 NULL, // kColor_Mode | |
| 972 NULL, // kLuminosity_Mode | |
| 973 }; | |
| 974 | |
| 975 SK_COMPILE_ASSERT( | |
| 976 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, | |
| 977 mode_count_arm | |
| 978 ); | |
| 979 | |
| 980 SkXfermodeProc gNEONXfermodeProcs1[] = { | |
| 981 NULL, // kClear_Mode | |
| 982 NULL, // kSrc_Mode | |
| 983 NULL, // kDst_Mode | |
| 984 NULL, // kSrcOver_Mode | |
| 985 NULL, // kDstOver_Mode | |
| 986 NULL, // kSrcIn_Mode | |
| 987 NULL, // kDstIn_Mode | |
| 988 NULL, // kSrcOut_Mode | |
| 989 NULL, // kDstOut_Mode | |
| 990 srcatop_modeproc_neon, | |
| 991 dstatop_modeproc_neon, | |
| 992 xor_modeproc_neon, | |
| 993 plus_modeproc_neon, | |
| 994 modulate_modeproc_neon, | |
| 995 NULL, // kScreen_Mode | |
| 996 | |
| 997 NULL, // kOverlay_Mode | |
| 998 NULL, // kDarken_Mode | |
| 999 NULL, // kLighten_Mode | |
| 1000 NULL, // kColorDodge_Mode | |
| 1001 NULL, // kColorBurn_Mode | |
| 1002 NULL, // kHardLight_Mode | |
| 1003 NULL, // kSoftLight_Mode | |
| 1004 NULL, // kDifference_Mode | |
| 1005 NULL, // kExclusion_Mode | |
| 1006 NULL, // kMultiply_Mode | |
| 1007 | |
| 1008 NULL, // kHue_Mode | |
| 1009 NULL, // kSaturation_Mode | |
| 1010 NULL, // kColor_Mode | |
| 1011 NULL, // kLuminosity_Mode | |
| 1012 }; | |
| 1013 | |
| 1014 SK_COMPILE_ASSERT( | |
| 1015 SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1, | |
| 1016 mode1_count_arm | |
| 1017 ); | |
| 1018 | |
| 1019 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, | |
| 1020 SkXfermode::Mode mode) { | |
| 1021 if (auto xfermode = SkCreate4pxXfermode(rec, mode)) { | |
| 1022 return xfermode; | |
| 1023 } | |
| 1024 // TODO: Sk4pxXfermode now covers every mode found in this file. Delete them all! | |
| 1025 if (auto proc = gNEONXfermodeProcs[mode]) { | |
| 1026 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, (void*)proc)); | |
| 1027 } | |
| 1028 return NULL; | |
| 1029 } | |
| 1030 | |
| 1031 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | |
| 1032 return gNEONXfermodeProcs1[mode]; | |
| 1033 } | |