| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 223 "21: \n\t" | 223 "21: \n\t" |
| 224 : [count] "+r" (count) | 224 : [count] "+r" (count) |
| 225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s
rc) | 225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s
rc) |
| 226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6"
,"d7", | 226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6"
,"d7", |
| 227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25
","d26","d27","d28","d29", | 227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25
","d26","d27","d28","d29", |
| 228 "d30","d31" | 228 "d30","d31" |
| 229 ); | 229 ); |
| 230 } | 230 } |
| 231 } | 231 } |
| 232 | 232 |
| 233 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
| 234 prod += vdupq_n_u16(128); |
| 235 prod += vshrq_n_u16(prod, 8); |
| 236 return vshrq_n_u16(prod, 8); |
| 237 } |
| 238 |
| 233 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 239 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 234 const SkPMColor* SK_RESTRICT src, int count, | 240 const SkPMColor* SK_RESTRICT src, int count, |
| 235 U8CPU alpha, int /*x*/, int /*y*/) { | 241 U8CPU alpha, int /*x*/, int /*y*/) { |
| 242 SkASSERT(255 > alpha); |
| 236 | 243 |
| 237 U8CPU alpha_for_asm = alpha; | 244 /* This code implements a Neon version of S32A_D565_Blend. The results have |
| 238 | 245 * a few mismatches compared to the original code. These mismatches never |
| 239 asm volatile ( | 246 * exceed 1. |
| 240 /* This code implements a Neon version of S32A_D565_Blend. The output differ
s from | |
| 241 * the original in two respects: | |
| 242 * 1. The results have a few mismatches compared to the original code. Thes
e mismatches | |
| 243 * never exceed 1. It's possible to improve accuracy vs. a floating poin
t | |
| 244 * implementation by introducing rounding right shifts (vrshr) for the f
inal stage. | |
| 245 * Rounding is not present in the code below, because although results w
ould be closer | |
| 246 * to a floating point implementation, the number of mismatches compared
to the | |
| 247 * original code would be far greater. | |
| 248 * 2. On certain inputs, the original code can overflow, causing colour cha
nnels to | |
| 249 * mix. Although the Neon code can also overflow, it doesn't allow one c
olour channel | |
| 250 * to affect another. | |
| 251 */ | 247 */ |
| 252 | 248 |
| 253 #if 1 | 249 if (count >= 8) { |
| 254 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ | 250 uint16x8_t valpha_max, vmask_blue; |
| 255 "add %[alpha], %[alpha], #1 \n\t" // adjust r
ange of alpha 0-256 | 251 uint8x8_t valpha; |
| 252 |
| 253 // prepare constants |
| 254 valpha_max = vmovq_n_u16(255); |
| 255 valpha = vdup_n_u8(alpha); |
| 256 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
| 257 |
| 258 do { |
| 259 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 260 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
| 261 uint8x8x4_t vsrc; |
| 262 |
| 263 // load pixels |
| 264 vdst = vld1q_u16(dst); |
| 265 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 266 asm ( |
| 267 "vld4.u8 %h[vsrc], [%[src]]!" |
| 268 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 269 : : |
| 270 ); |
| 256 #else | 271 #else |
| 257 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" //
adjust range of alpha 0-256 | 272 register uint8x8_t d0 asm("d0"); |
| 258 #endif | 273 register uint8x8_t d1 asm("d1"); |
| 259 "vmov.u16 q3, #255 \n\t" // set up
constant | 274 register uint8x8_t d2 asm("d2"); |
| 260 "movs r4, %[count], lsr #3 \n\t" // calc. c
ount>>3 | 275 register uint8x8_t d3 asm("d3"); |
| 261 "vmov.u16 d2[0], %[alpha] \n\t" // move al
pha to Neon | |
| 262 "beq 2f \n\t" // if coun
t8 == 0, exit | |
| 263 "vmov.u16 q15, #0x1f \n\t" // set up
blue mask | |
| 264 | 276 |
| 265 "1: \n\t" | 277 asm volatile ( |
| 266 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load ei
ght dst RGB565 pixels | 278 "vld4.u8 {d0-d3},[%[src]]!;" |
| 267 "subs r4, r4, #1 \n\t" // decreme
nt loop counter | 279 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
| 268 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load ei
ght src ABGR32 pixels | 280 [src] "+&r" (src) |
| 269 // and deinterleave | 281 : : |
| 270 | 282 ); |
| 271 "vshl.u16 q9, q0, #5 \n\t" // shift g
reen to top of lanes | 283 vsrc.val[0] = d0; |
| 272 "vand q10, q0, q15 \n\t" // extract
blue | 284 vsrc.val[1] = d1; |
| 273 "vshr.u16 q8, q0, #11 \n\t" // extract
red | 285 vsrc.val[2] = d2; |
| 274 "vshr.u16 q9, q9, #10 \n\t" // extract
green | 286 vsrc.val[3] = d3; |
| 275 // dstrgb = {q8, q9, q10} | |
| 276 | |
| 277 "vshr.u8 d24, d24, #3 \n\t" // shift r
ed to 565 range | |
| 278 "vshr.u8 d25, d25, #2 \n\t" // shift g
reen to 565 range | |
| 279 "vshr.u8 d26, d26, #3 \n\t" // shift b
lue to 565 range | |
| 280 | |
| 281 "vmovl.u8 q11, d24 \n\t" // widen r
ed to 16 bits | |
| 282 "vmovl.u8 q12, d25 \n\t" // widen g
reen to 16 bits | |
| 283 "vmovl.u8 q14, d27 \n\t" // widen a
lpha to 16 bits | |
| 284 "vmovl.u8 q13, d26 \n\t" // widen b
lue to 16 bits | |
| 285 // srcrgba = {q11, q12, q13, q14} | |
| 286 | |
| 287 "vmul.u16 q2, q14, d2[0] \n\t" // sa * sr
c_scale | |
| 288 "vmul.u16 q11, q11, d2[0] \n\t" // red res
ult = src_red * src_scale | |
| 289 "vmul.u16 q12, q12, d2[0] \n\t" // grn res
ult = src_grn * src_scale | |
| 290 "vmul.u16 q13, q13, d2[0] \n\t" // blu res
ult = src_blu * src_scale | |
| 291 | |
| 292 "vshr.u16 q2, q2, #8 \n\t" // sa * sr
c_scale >> 8 | |
| 293 "vsub.u16 q2, q3, q2 \n\t" // 255 - (
sa * src_scale >> 8) | |
| 294 // dst_scale = q2 | |
| 295 | |
| 296 "vmla.u16 q11, q8, q2 \n\t" // red res
ult += dst_red * dst_scale | |
| 297 "vmla.u16 q12, q9, q2 \n\t" // grn res
ult += dst_grn * dst_scale | |
| 298 "vmla.u16 q13, q10, q2 \n\t" // blu res
ult += dst_blu * dst_scale | |
| 299 | |
| 300 #if 1 | |
| 301 // trying for a better match with SkDiv255Round(a) | |
| 302 // C alg is: a+=128; (a+a>>8)>>8 | |
| 303 // we'll use just a rounding shift [q2 is available for scratch] | |
| 304 "vrshr.u16 q11, q11, #8 \n\t" // shift
down red | |
| 305 "vrshr.u16 q12, q12, #8 \n\t" // shift
down green | |
| 306 "vrshr.u16 q13, q13, #8 \n\t" // shift
down blue | |
| 307 #else | |
| 308 // arm's original "truncating divide by 256" | |
| 309 "vshr.u16 q11, q11, #8 \n\t" // shift d
own red | |
| 310 "vshr.u16 q12, q12, #8 \n\t" // shift d
own green | |
| 311 "vshr.u16 q13, q13, #8 \n\t" // shift d
own blue | |
| 312 #endif | 287 #endif |
| 313 | 288 |
| 314 "vsli.u16 q13, q12, #5 \n\t" // insert
green into blue | |
| 315 "vsli.u16 q13, q11, #11 \n\t" // insert
red into green/blue | |
| 316 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write p
ixel back to dst, update ptr | |
| 317 | 289 |
| 318 "bne 1b \n\t" // if coun
ter != 0, loop | 290 // deinterleave dst |
| 319 "2: \n\t" // exi
t | 291 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to
p of lanes |
| 292 vdst_b = vdst & vmask_blue; // extract blue |
| 293 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
| 294 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract
green |
| 320 | 295 |
| 321 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [a
lpha] "+r" (alpha_for_asm) | 296 // shift src to 565 |
| 322 : | 297 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
| 323 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d
6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
"d26", "d27", "d28", "d29", "d30", "d31" | 298 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); |
| 324 ); | 299 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); |
| 325 | 300 |
| 326 count &= 7; | 301 // calc src * src_scale |
| 327 if (count > 0) { | 302 vres_a = vmull_u8(vsrc.val[NEON_A], valpha); |
| 328 do { | 303 vres_r = vmull_u8(vsrc.val[NEON_R], valpha); |
| 329 SkPMColor sc = *src++; | 304 vres_g = vmull_u8(vsrc.val[NEON_G], valpha); |
| 330 if (sc) { | 305 vres_b = vmull_u8(vsrc.val[NEON_B], valpha); |
| 331 uint16_t dc = *dst; | 306 |
| 332 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc),
alpha); | 307 // prepare dst_scale |
| 333 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(Sk
GetPackedR16(dc), dst_scale); | 308 vres_a = SkDiv255Round_neon8(vres_a); |
| 334 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(Sk
GetPackedG16(dc), dst_scale); | 309 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 |
| 335 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(Sk
GetPackedB16(dc), dst_scale); | 310 |
| 336 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv25
5Round(db)); | 311 // add dst * dst_scale to previous result |
| 337 } | 312 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); |
| 338 dst += 1; | 313 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); |
| 339 } while (--count != 0); | 314 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); |
| 315 |
| 316 #ifdef S32A_D565_BLEND_EXACT |
| 317 // It is possible to get exact results with this but it is slow, |
| 318 // even slower than C code in some cases |
| 319 vres_r = SkDiv255Round_neon8(vres_r); |
| 320 vres_g = SkDiv255Round_neon8(vres_g); |
| 321 vres_b = SkDiv255Round_neon8(vres_b); |
| 322 #else |
| 323 vres_r = vrshrq_n_u16(vres_r, 8); |
| 324 vres_g = vrshrq_n_u16(vres_g, 8); |
| 325 vres_b = vrshrq_n_u16(vres_b, 8); |
| 326 #endif |
| 327 // pack result |
| 328 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green
into blue |
| 329 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red in
to green/blue |
| 330 |
| 331 // store |
| 332 vst1q_u16(dst, vres_b); |
| 333 dst += 8; |
| 334 count -= 8; |
| 335 } while (count >= 8); |
| 336 } |
| 337 |
| 338 // leftovers |
| 339 while (count-- > 0) { |
| 340 SkPMColor sc = *src++; |
| 341 if (sc) { |
| 342 uint16_t dc = *dst; |
| 343 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alph
a); |
| 344 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetP
ackedR16(dc), dst_scale); |
| 345 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetP
ackedG16(dc), dst_scale); |
| 346 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetP
ackedB16(dc), dst_scale); |
| 347 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Rou
nd(db)); |
| 348 } |
| 349 dst += 1; |
| 340 } | 350 } |
| 341 } | 351 } |
| 342 | 352 |
| 343 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. | 353 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. |
| 344 * each dither value is spaced out into byte lanes, and repeated | 354 * each dither value is spaced out into byte lanes, and repeated |
| 345 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the | 355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the |
| 346 * start of each row. | 356 * start of each row. |
| 347 */ | 357 */ |
| 348 static const uint8_t gDitherMatrix_Neon[48] = { | 358 static const uint8_t gDitherMatrix_Neon[48] = { |
| 349 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, | 359 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, |
| (...skipping 1084 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1434 * case where we do not inspect the src alpha. | 1444 * case where we do not inspect the src alpha. |
| 1435 */ | 1445 */ |
| 1436 #if SK_A32_SHIFT == 24 | 1446 #if SK_A32_SHIFT == 24 |
| 1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1447 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1448 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1439 #else | 1449 #else |
| 1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1450 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1441 #endif | 1451 #endif |
| 1442 S32A_Blend_BlitRow32_neon // S32A_Blend | 1452 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1443 }; | 1453 }; |
| OLD | NEW |