src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 845293002: skia: blend32_16_row for neon version

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 845293002: skia: blend32_16_row for neon version (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 447 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
458 SkPMColorAssert(c);	458 SkPMColorAssert(c);

459 if (c) {	459 if (c) {

460 dst = SkSrcOver32To16(c, dst);	460 dst = SkSrcOver32To16(c, dst);

461 }	461 }

462 dst += 1;	462 dst += 1;

463 } while (--count != 0);	463 } while (--count != 0);

464 }	464 }

465 }	465 }

466 #endif // #ifdef SK_CPU_ARM32	466 #endif // #ifdef SK_CPU_ARM32

467	467

	468 static uint32_t pmcolor_to_expand16(SkPMColor c) {

	469 unsigned r = SkGetPackedR32(c);

	470 unsigned g = SkGetPackedG32(c);

	471 unsigned b = SkGetPackedB32(c);

	472 return (g << 24) \| (r << 13) \| (b << 2);

	473 }

	474

	475 void blend32_16_row_neon(const SkPMColor* SK_RESTRICT src,

	476 uint16_t dst[], int count) {

	477 uint32_t src_expand;

	478 unsigned scale;

	479

	480 if (count <= 0) return;

	481 SkASSERT(((size_t)dst & 0x01) == 0);

	482

	483 /*

	484 * This preamble code is in order to make dst aligned to 8 bytes

	485 * in the next mutiple bytes read & write access.

	486 */

	487 src_expand = pmcolor_to_expand16(*src);

	488 scale = SkAlpha255To256(0xFF - SkGetPackedA32(*src)) >> 3;

	489

	490 #define DST_ALIGN 8

	491

	492 /*

	493 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time.

	494 */

	495 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

	496

	497 for (int i = 0; i < preamble_size; i+=2, dst++) {

	498 uint32_t dst_expand = SkExpand_rgb_16(dst) scale;

	499 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	500 if (--count == 0)

	501 break;

	502 }

	503 #ifdef SK_CPU_ARM64

	504 asm (

	505 "lsr x2, %[count], #4 \n\t"

	506 "mov x1, x2 \n\t" // calc. count>>4

	507 "cbz x1, back \n\t" // if (count>>4) == 0, exit

	508 "ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[src]] \n\t" // load eight src ABGR32 pixels

	509

	510 "uxtl v4.8h, v24.8b \n\t" // widen red to 16 bits

	511 "uxtl v5.8h, v25.8b \n\t" // widen green to 16 bits

	512 "uxtl v6.8h, v26.8b \n\t" // widen blue to 16 bits

	513

	514 "shl v4.8h, v4.8h, #2 \n\t" // src red = src_red << 2 (later will do >> 5 to make 5 bit red)

	515 "shl v5.8h, v5.8h, #3 \n\t" // src grn = src_grn << 3 (later will do >> 5 to make 6 bit grn)

	516 "shl v6.8h, v6.8h, #2 \n\t" // src blu = src_blu << 2 (later will do >> 5 to make 5 bit blu)

	517

	518 "movi v21.8h, #1, lsl#8 \n\t" // set up constant 256 (1<<8)

	519 "uxtl v14.8h, v27.8b \n\t" // widen alpha to 16 bits

	520 "sub v14.8h, v21.8h, v14.8h \n\t" // 256 - sa

	521 "ushr v14.8h, v14.8h, #3 \n\t" // (256 - sa) >> 3

	522

	523 "front: \n\t"

	524 "ld1 {v0.8h, v1.8h}, [%[dst]] \n\t" // load sixteen dst RGB565 pixels

	525 //set PREFETCH_DISTANCE to 128

	526 "prfum pldl1keep, [%[dst], #128] \n\t"

	527

	528 "subs x1, x1, #1 \n\t" // decrement loop counter

	529

	530 "shl v9.8h, v0.8h, #5 \n\t" // shift green to top of lanes

	531

	532 "shl v10.8h, v0.8h, #11 \n\t" // shift blue to top of lanes

	533 "ushr v10.8h, v10.8h, #11 \n\t" // extract blue

	534

	535 "ushr v8.8h, v0.8h, #11 \n\t" // extract red

	536 "ushr v9.8h, v9.8h, #10 \n\t" // extract green

	537

	538 "shl v3.8h, v1.8h, #5 \n\t" // shift green to top of lanes

	539

	540 "shl v7.8h, v1.8h, #11 \n\t" // shift blue to top of lanes

	541 "ushr v7.8h, v7.8h, #11 \n\t" // extract blue

	542

	543 "ushr v2.8h, v1.8h, #11 \n\t" // extract red

	544 "ushr v3.8h, v3.8h, #10 \n\t" // extract green

	545

	546 //If we use src in mla, directly, vd.8h is updated by mla, so need to calculate src in next loop again, which requires 12 cycles.

	547 //Instead, copy src to other registers and use them as a mla d estination.

	548 //6 shl commands are needed, but we don't need to update src. In total, will get 6 cycle benefit in each loop.

	549

	550 "shl v15.8h, v4.8h, #0 \n\t" // copy dst 0 red result

	551 "shl v16.8h, v5.8h, #0 \n\t" // copy dst 0 grn result

	552 "shl v17.8h, v6.8h, #0 \n\t" // copy dst 0 blu result

	553 "mla v15.8h, v8.8h, v14.8h \n\t" // dst 0 red result = dst_red * dst_scale

	554 "mla v16.8h, v9.8h, v14.8h \n\t" // dst 0 grn result = dst_grn * dst_scale

	555 "mla v17.8h, v10.8h, v14.8h \n\t" // dst 0 blu result = dst_blu * dst_scale

	556

	557 "shl v20.8h, v4.8h, #0 \n\t" // copy dst 1 red result

	558 "shl v19.8h, v5.8h, #0 \n\t" // copy dst 1 grn result

	559 "shl v18.8h, v6.8h, #0 \n\t" // copy dst 1 blu result

	560 "mla v20.8h, v2.8h, v14.8h \n\t" // dst 1 red result = dst_red * dst_scale

	561 "mla v19.8h, v3.8h, v14.8h \n\t" // dst 1 grn result = dst_grn * dst_scale

	562 "mla v18.8h, v7.8h, v14.8h \n\t" // dst 1 blu result = dst_blu * dst_scale

	563

	564 "ushr v15.8h, v15.8h, #5 \n\t" // dst 0 red result >> 5

	565 "ushr v16.8h, v16.8h, #5 \n\t" // dst 0 grn result >> 5

	566 "ushr v17.8h, v17.8h, #5 \n\t" // dst 0 blu result >> 5

	567

	568 "ushr v20.8h, v20.8h, #5 \n\t" // dst 1 red result >> 5

	569 "ushr v19.8h, v19.8h, #5 \n\t" // dst 1 grn result >> 5

	570 "ushr v18.8h, v18.8h, #5 \n\t" // dst 1 blu result >> 5

	571

	572 "sli v17.8h, v16.8h, #5 \n\t" // dst 0 insert green into blue

	573 "sli v17.8h, v15.8h, #11 \n\t" // dst 0 insert red into green/blue

	574

	575 "sli v18.8h, v19.8h, #5 \n\t" // dst 1 insert green into blue

	576 "sli v18.8h, v20.8h, #11 \n\t" // dst 1 insert red into green/blue

	577

	578 "st1 {v17.8h, v18.8h}, [%[dst]], #32 \n\t" // write pixel back to dst 0 and dst 1, update ptr

	579 "cbnz x1, front \n\t" // if counter !=0, loop

	580 "back: \n\t" // exit

	581

	582 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

	583 : : "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" , "v8", "v9", "v10", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",

	584 "v24", "v25", "v26", "v27"

	585 );

	586 #else

	587 asm volatile (

	588 "movs r4, %[count], lsr #4 \n\t" // cal c. count>>4

	589 "beq 2f \n\t" // if (count>>4) == 0, exit

	590 "vmov.u16 q15, #0x1f \n\t" // set up blue mask

	591 "vld4.u8 {d24, d25, d26, d27}, [%[src]] \n\t" // loa d eight src ABGR32 pixels

	592

	593 "vmov r5, r6, d24 \n\t" // sav e src red in r5, r6

	594 "vmov r7, r8, d25 \n\t" // sav e src green in r7, r8

	595 "vmov r9, r10, d26 \n\t" // sav e src blue in r9, r10

	596 "vmov r11, r12, d27 \n\t" // sav e src alpha in r11, r12

	597

	598

	599 "1: \n\t"

	600 "vld1.u16 {d0, d1, d2, d3}, [%[dst]] \n\t" // loa d sixteen dst RGB565 pixels

	601 //set PREFETCH_DISTANCE to 128

	602 "pld [%[dst], #128] \n\t"

	603

	604 "subs r4, r4, #1 \n\t" // dec rement loop counter

	605

	606 "vmov d24, r5, r6 \n\t" // src red to d24

	607 "vmov d25, r7, r8 \n\t" // src green to d25

	608 "vmov d26, r9, r10 \n\t" // src blue to d26

	609 "vmov d27, r11, r12 \n\t" // src alpha to d27

	610

	611 "vmov.u16 q3, #256 \n\t" // set up constant

	612 "vmovl.u8 q14, d27 \n\t" // wid en alpha to 16 bits

	613 // dst_scale = q14

	614 "vsub.u16 q14, q3, q14 \n\t" // 256 - sa

	615 "vshr.u16 q14, q14, #3 \n\t" // (25 6 - sa) >> 3

	616

	617

	618 // dst_0_rgb = {q8, q9, q10}

	619 "vshl.u16 q9, q0, #5 \n\t" // shi ft green to top of lanes

	620 "vand q10, q0, q15 \n\t" // ext ract blue

	621 "vshr.u16 q8, q0, #11 \n\t" // ext ract red

	622 "vshr.u16 q9, q9, #10 \n\t" // ext ract green

	623

	624 //use q3 for dst_1 green. In the next loop, needs to set q3 to 256 again.

	625 // dst_1_rgb = {q2, q3, q7}

	626 "vshl.u16 q3, q1, #5 \n\t" // shi ft green to top of lanes

	627 "vand q7, q1, q15 \n\t" // ext ract blue

	628 "vshr.u16 q2, q1, #11 \n\t" // ext ract red

	629 "vshr.u16 q3, q3, #10 \n\t" // ext ract green

	630

	631 // srcrgba = {q4, q5, q6, q14}, alpha calculation is done alre ady in above.

	632 // q4, q5, q6 will have each channel's result of dst_1_rgb.

	633 "vmovl.u8 q4, d24 \n\t" // wid en red to 16 bits

	634 "vmovl.u8 q5, d25 \n\t" // wid en green to 16 bits

	635 "vmovl.u8 q6, d26 \n\t" // wid en blue to 16 bits

	636

	637 // srcrgba = {q11, q12, q13, q14}, alpha calculation is done a lready in above.

	638 // q11, q12, q13 will have each channel's result of dst_0_rgb.

	639 "vmovl.u8 q11, d24 \n\t" // wid en red to 16 bits

	640 "vmovl.u8 q12, d25 \n\t" // wid en green to 16 bits

	641 "vmovl.u8 q13, d26 \n\t" // wid en blue to 16 bits

	642

	643 "vshl.u16 q11, q11, #2 \n\t" // dst 0 red result = src_red << 2 (later will do >> 5 to make 5 bit red)

	644 "vshl.u16 q12, q12, #3 \n\t" // dst 0 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)

	645 "vshl.u16 q13, q13, #2 \n\t" // dst 0 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)

	646

	647 "vshl.u16 q4, q4, #2 \n\t" // dst 1 red result = src_red << 2 (later will do >> 5 to make 5 bit red)

	648 "vshl.u16 q5, q5, #3 \n\t" // dst 1 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)

	649 "vshl.u16 q6, q6, #2 \n\t" // dst 1 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)

	650

	651 "vmla.u16 q11, q8, q14 \n\t" // dst 0 red result += dst_red * dst_scale

	652 "vmla.u16 q12, q9, q14 \n\t" // dst 0 grn result += dst_grn * dst_scale

	653 "vmla.u16 q13, q10, q14 \n\t" // dst 0 blu result += dst_blu * dst_scale

	654

	655 "vmla.u16 q4, q2, q14 \n\t" // dst 1 red result += dst_red * dst_scale

	656 "vmla.u16 q5, q3, q14 \n\t" // dst 1 grn result += dst_grn * dst_scale

	657 "vmla.u16 q6, q7, q14 \n\t" // dst 1 blu result += dst_blu * dst_scale

	658

	659 "vshr.u16 q11, q11, #5 \n\t" // dst 0 red result >> 5

	660 "vshr.u16 q12, q12, #5 \n\t" // dst 0 grn result >> 5

	661 "vshr.u16 q13, q13, #5 \n\t" // dst 0 blu result >> 5

	662

	663 "vshr.u16 q4, q4, #5 \n\t" // dst 1 red result >> 5

	664 "vshr.u16 q5, q5, #5 \n\t" // dst 1 grn result >> 5

	665 "vshr.u16 q14, q6, #5 \n\t" // dst 1 blu result >> 5

	666

	667 "vsli.u16 q13, q12, #5 \n\t" // dst 0 insert green into blue

	668 "vsli.u16 q13, q11, #11 \n\t" // dst 0 insert red into green/blue

	669

	670 "vsli.u16 q14, q5, #5 \n\t" // dst 1 insert green into blue

	671 "vsli.u16 q14, q4, #11 \n\t" // dst 1 insert red into green/blue

	672

	673 "vst1.16 {d26, d27, d28, d29}, [%[dst]]! \n\t" // wri te pixel back to dst 0 and dst 1, update ptr

	674

	675 "bne 1b \n\t" // if counter != 0, loop

	676 "2: \n\t" // exi t

	677

	678 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

	679 :

	680 : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", " r11", "r12",

	681 "d0", "d1", "d2", "d3", "d4", "d5", "d 6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",

	682 "d17", "d18", "d19", "d20", "d21", "d2 2", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"

	683 );

	684 #endif

	685 count &= 0xF;

	686 if (count > 0) {

	687 do {

	688 uint32_t dst_expand = SkExpand_rgb_16(dst) scale;

	689 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	690 dst += 1;

	691 } while (--count != 0);

	692 }

	693 }

	694

468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {	695 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {

469 prod += vdupq_n_u16(128);	696 prod += vdupq_n_u16(128);

470 prod += vshrq_n_u16(prod, 8);	697 prod += vshrq_n_u16(prod, 8);

471 return vshrq_n_u16(prod, 8);	698 return vshrq_n_u16(prod, 8);

472 }	699 }

473	700

474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,	701 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,

475 const SkPMColor* SK_RESTRICT src, int count,	702 const SkPMColor* SK_RESTRICT src, int count,

476 U8CPU alpha, int /x/, int /y/) {	703 U8CPU alpha, int /x/, int /y/) {

477 SkASSERT(255 > alpha);	704 SkASSERT(255 > alpha);

(...skipping 1204 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1909 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1683 #else	1910 #else

1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1911 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1685 #endif	1912 #endif

1686 #ifdef SK_CPU_ARM32	1913 #ifdef SK_CPU_ARM32

1687 S32A_Blend_BlitRow32_neon // S32A_Blend	1914 S32A_Blend_BlitRow32_neon // S32A_Blend

1688 #else	1915 #else

1689 NULL	1916 NULL

1690 #endif	1917 #endif

1691 };	1918 };

OLD	NEW

« no previous file with comments | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | src/opts/SkBlitRow_opts_mips_dsp.cpp » ('j') | no next file with comments »