OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
223 "21: \n\t" | 223 "21: \n\t" |
224 : [count] "+r" (count) | 224 : [count] "+r" (count) |
225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s
rc) | 225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s
rc) |
226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6"
,"d7", | 226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6"
,"d7", |
227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25
","d26","d27","d28","d29", | 227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25
","d26","d27","d28","d29", |
228 "d30","d31" | 228 "d30","d31" |
229 ); | 229 ); |
230 } | 230 } |
231 } | 231 } |
232 | 232 |
| 233 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
| 234 prod += vdupq_n_u16(128); |
| 235 prod += vshrq_n_u16(prod, 8); |
| 236 return vshrq_n_u16(prod, 8); |
| 237 } |
| 238 |
233 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 239 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
234 const SkPMColor* SK_RESTRICT src, int count, | 240 const SkPMColor* SK_RESTRICT src, int count, |
235 U8CPU alpha, int /*x*/, int /*y*/) { | 241 U8CPU alpha, int /*x*/, int /*y*/) { |
| 242 SkASSERT(255 > alpha); |
236 | 243 |
237 U8CPU alpha_for_asm = alpha; | 244 /* This code implements a Neon version of S32A_D565_Blend. The results have |
238 | 245 * a few mismatches compared to the original code. These mismatches never |
239 asm volatile ( | 246 * exceed 1. |
240 /* This code implements a Neon version of S32A_D565_Blend. The output differ
s from | |
241 * the original in two respects: | |
242 * 1. The results have a few mismatches compared to the original code. Thes
e mismatches | |
243 * never exceed 1. It's possible to improve accuracy vs. a floating poin
t | |
244 * implementation by introducing rounding right shifts (vrshr) for the f
inal stage. | |
245 * Rounding is not present in the code below, because although results w
ould be closer | |
246 * to a floating point implementation, the number of mismatches compared
to the | |
247 * original code would be far greater. | |
248 * 2. On certain inputs, the original code can overflow, causing colour cha
nnels to | |
249 * mix. Although the Neon code can also overflow, it doesn't allow one c
olour channel | |
250 * to affect another. | |
251 */ | 247 */ |
252 | 248 |
253 #if 1 | 249 if (count >= 8) { |
254 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ | 250 uint16x8_t valpha_max, vmask_blue; |
255 "add %[alpha], %[alpha], #1 \n\t" // adjust r
ange of alpha 0-256 | 251 uint8x8_t valpha; |
| 252 |
| 253 // prepare constants |
| 254 valpha_max = vmovq_n_u16(255); |
| 255 valpha = vdup_n_u8(alpha); |
| 256 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
| 257 |
| 258 do { |
| 259 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 260 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
| 261 uint8x8x4_t vsrc; |
| 262 |
| 263 // load pixels |
| 264 vdst = vld1q_u16(dst); |
| 265 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 266 asm ( |
| 267 "vld4.u8 %h[vsrc], [%[src]]!" |
| 268 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 269 : : |
| 270 ); |
256 #else | 271 #else |
257 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" //
adjust range of alpha 0-256 | 272 register uint8x8_t d0 asm("d0"); |
258 #endif | 273 register uint8x8_t d1 asm("d1"); |
259 "vmov.u16 q3, #255 \n\t" // set up
constant | 274 register uint8x8_t d2 asm("d2"); |
260 "movs r4, %[count], lsr #3 \n\t" // calc. c
ount>>3 | 275 register uint8x8_t d3 asm("d3"); |
261 "vmov.u16 d2[0], %[alpha] \n\t" // move al
pha to Neon | |
262 "beq 2f \n\t" // if coun
t8 == 0, exit | |
263 "vmov.u16 q15, #0x1f \n\t" // set up
blue mask | |
264 | 276 |
265 "1: \n\t" | 277 asm volatile ( |
266 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load ei
ght dst RGB565 pixels | 278 "vld4.u8 {d0-d3},[%[src]]!;" |
267 "subs r4, r4, #1 \n\t" // decreme
nt loop counter | 279 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
268 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load ei
ght src ABGR32 pixels | 280 [src] "+&r" (src) |
269 // and deinterleave | 281 : : |
270 | 282 ); |
271 "vshl.u16 q9, q0, #5 \n\t" // shift g
reen to top of lanes | 283 vsrc.val[0] = d0; |
272 "vand q10, q0, q15 \n\t" // extract
blue | 284 vsrc.val[1] = d1; |
273 "vshr.u16 q8, q0, #11 \n\t" // extract
red | 285 vsrc.val[2] = d2; |
274 "vshr.u16 q9, q9, #10 \n\t" // extract
green | 286 vsrc.val[3] = d3; |
275 // dstrgb = {q8, q9, q10} | |
276 | |
277 "vshr.u8 d24, d24, #3 \n\t" // shift r
ed to 565 range | |
278 "vshr.u8 d25, d25, #2 \n\t" // shift g
reen to 565 range | |
279 "vshr.u8 d26, d26, #3 \n\t" // shift b
lue to 565 range | |
280 | |
281 "vmovl.u8 q11, d24 \n\t" // widen r
ed to 16 bits | |
282 "vmovl.u8 q12, d25 \n\t" // widen g
reen to 16 bits | |
283 "vmovl.u8 q14, d27 \n\t" // widen a
lpha to 16 bits | |
284 "vmovl.u8 q13, d26 \n\t" // widen b
lue to 16 bits | |
285 // srcrgba = {q11, q12, q13, q14} | |
286 | |
287 "vmul.u16 q2, q14, d2[0] \n\t" // sa * sr
c_scale | |
288 "vmul.u16 q11, q11, d2[0] \n\t" // red res
ult = src_red * src_scale | |
289 "vmul.u16 q12, q12, d2[0] \n\t" // grn res
ult = src_grn * src_scale | |
290 "vmul.u16 q13, q13, d2[0] \n\t" // blu res
ult = src_blu * src_scale | |
291 | |
292 "vshr.u16 q2, q2, #8 \n\t" // sa * sr
c_scale >> 8 | |
293 "vsub.u16 q2, q3, q2 \n\t" // 255 - (
sa * src_scale >> 8) | |
294 // dst_scale = q2 | |
295 | |
296 "vmla.u16 q11, q8, q2 \n\t" // red res
ult += dst_red * dst_scale | |
297 "vmla.u16 q12, q9, q2 \n\t" // grn res
ult += dst_grn * dst_scale | |
298 "vmla.u16 q13, q10, q2 \n\t" // blu res
ult += dst_blu * dst_scale | |
299 | |
300 #if 1 | |
301 // trying for a better match with SkDiv255Round(a) | |
302 // C alg is: a+=128; (a+a>>8)>>8 | |
303 // we'll use just a rounding shift [q2 is available for scratch] | |
304 "vrshr.u16 q11, q11, #8 \n\t" // shift
down red | |
305 "vrshr.u16 q12, q12, #8 \n\t" // shift
down green | |
306 "vrshr.u16 q13, q13, #8 \n\t" // shift
down blue | |
307 #else | |
308 // arm's original "truncating divide by 256" | |
309 "vshr.u16 q11, q11, #8 \n\t" // shift d
own red | |
310 "vshr.u16 q12, q12, #8 \n\t" // shift d
own green | |
311 "vshr.u16 q13, q13, #8 \n\t" // shift d
own blue | |
312 #endif | 287 #endif |
313 | 288 |
314 "vsli.u16 q13, q12, #5 \n\t" // insert
green into blue | |
315 "vsli.u16 q13, q11, #11 \n\t" // insert
red into green/blue | |
316 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write p
ixel back to dst, update ptr | |
317 | 289 |
318 "bne 1b \n\t" // if coun
ter != 0, loop | 290 // deinterleave dst |
319 "2: \n\t" // exi
t | 291 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to
p of lanes |
| 292 vdst_b = vdst & vmask_blue; // extract blue |
| 293 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
| 294 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract
green |
320 | 295 |
321 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [a
lpha] "+r" (alpha_for_asm) | 296 // shift src to 565 |
322 : | 297 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
323 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d
6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
"d26", "d27", "d28", "d29", "d30", "d31" | 298 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); |
324 ); | 299 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); |
325 | 300 |
326 count &= 7; | 301 // calc src * src_scale |
327 if (count > 0) { | 302 vres_a = vmull_u8(vsrc.val[NEON_A], valpha); |
328 do { | 303 vres_r = vmull_u8(vsrc.val[NEON_R], valpha); |
329 SkPMColor sc = *src++; | 304 vres_g = vmull_u8(vsrc.val[NEON_G], valpha); |
330 if (sc) { | 305 vres_b = vmull_u8(vsrc.val[NEON_B], valpha); |
331 uint16_t dc = *dst; | 306 |
332 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc),
alpha); | 307 // prepare dst_scale |
333 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(Sk
GetPackedR16(dc), dst_scale); | 308 vres_a = SkDiv255Round_neon8(vres_a); |
334 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(Sk
GetPackedG16(dc), dst_scale); | 309 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 |
335 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(Sk
GetPackedB16(dc), dst_scale); | 310 |
336 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv25
5Round(db)); | 311 // add dst * dst_scale to previous result |
337 } | 312 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); |
338 dst += 1; | 313 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); |
339 } while (--count != 0); | 314 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); |
| 315 |
| 316 #ifdef S32A_D565_BLEND_EXACT |
| 317 // It is possible to get exact results with this but it is slow, |
| 318 // even slower than C code in some cases |
| 319 vres_r = SkDiv255Round_neon8(vres_r); |
| 320 vres_g = SkDiv255Round_neon8(vres_g); |
| 321 vres_b = SkDiv255Round_neon8(vres_b); |
| 322 #else |
| 323 vres_r = vrshrq_n_u16(vres_r, 8); |
| 324 vres_g = vrshrq_n_u16(vres_g, 8); |
| 325 vres_b = vrshrq_n_u16(vres_b, 8); |
| 326 #endif |
| 327 // pack result |
| 328 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green
into blue |
| 329 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red in
to green/blue |
| 330 |
| 331 // store |
| 332 vst1q_u16(dst, vres_b); |
| 333 dst += 8; |
| 334 count -= 8; |
| 335 } while (count >= 8); |
| 336 } |
| 337 |
| 338 // leftovers |
| 339 while (count-- > 0) { |
| 340 SkPMColor sc = *src++; |
| 341 if (sc) { |
| 342 uint16_t dc = *dst; |
| 343 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alph
a); |
| 344 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetP
ackedR16(dc), dst_scale); |
| 345 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetP
ackedG16(dc), dst_scale); |
| 346 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetP
ackedB16(dc), dst_scale); |
| 347 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Rou
nd(db)); |
| 348 } |
| 349 dst += 1; |
340 } | 350 } |
341 } | 351 } |
342 | 352 |
343 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. | 353 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. |
344 * each dither value is spaced out into byte lanes, and repeated | 354 * each dither value is spaced out into byte lanes, and repeated |
345 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the | 355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the |
346 * start of each row. | 356 * start of each row. |
347 */ | 357 */ |
348 static const uint8_t gDitherMatrix_Neon[48] = { | 358 static const uint8_t gDitherMatrix_Neon[48] = { |
349 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, | 359 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, |
(...skipping 1084 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1434 * case where we do not inspect the src alpha. | 1444 * case where we do not inspect the src alpha. |
1435 */ | 1445 */ |
1436 #if SK_A32_SHIFT == 24 | 1446 #if SK_A32_SHIFT == 24 |
1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1447 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1448 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1439 #else | 1449 #else |
1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1450 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1441 #endif | 1451 #endif |
1442 S32A_Blend_BlitRow32_neon // S32A_Blend | 1452 S32A_Blend_BlitRow32_neon // S32A_Blend |
1443 }; | 1453 }; |
OLD | NEW |