src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 847363002: skia: blend32_16_row for neon version

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 847363002: skia: blend32_16_row for neon version (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 447 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
458 SkPMColorAssert(c);	458 SkPMColorAssert(c);

459 if (c) {	459 if (c) {

460 dst = SkSrcOver32To16(c, dst);	460 dst = SkSrcOver32To16(c, dst);

461 }	461 }

462 dst += 1;	462 dst += 1;

463 } while (--count != 0);	463 } while (--count != 0);

464 }	464 }

465 }	465 }

466 #endif // #ifdef SK_CPU_ARM32	466 #endif // #ifdef SK_CPU_ARM32

467	467

	468 static uint32_t pmcolor_to_expand16(SkPMColor c) {

	469 unsigned r = SkGetPackedR32(c);

	470 unsigned g = SkGetPackedG32(c);

	471 unsigned b = SkGetPackedB32(c);

	472 return (g << 24) \| (r << 13) \| (b << 2);

	473 }

	474

	475 void Color32A_D565_neon(uint16_t dst[], SkPMColor src[], int count, int x, int y ) {

	476 uint32_t src_expand;

	477 unsigned scale;

	478

	479 if (count <= 0) return;

	480 SkASSERT(((size_t)dst & 0x01) == 0);

	481

	482 /*

	483 * This preamble code is in order to make dst aligned to 8 bytes

	484 * in the next mutiple bytes read & write access.

	485 */

	486 src_expand = pmcolor_to_expand16(*src);

	487 scale = SkAlpha255To256(0xFF - SkGetPackedA32(*src)) >> 3;

	488

	489 #define DST_ALIGN 8

	490

	491 /*

	492 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time.

	493 */

	494 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

	495

	496 for (int i = 0; i < preamble_size; i+=2, dst++) {

	497 uint32_t dst_expand = SkExpand_rgb_16(dst) scale;

	498 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	499 if (--count == 0)

	500 break;

	501 }

	502 #ifdef SK_CPU_ARM64

	503 asm (

	504 "lsr x2, %[count], #4 \n\t"

	505 "mov x1, x2 \n\t" // calc. count>>4

	506 "cbz x1, back \n\t" // if (count>>4) == 0, exit

	507 "ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[src]] \n\t" // load eight src ABGR32 pixels

	508

	509 "uxtl v4.8h, v24.8b \n\t" // widen red to 16 bits

	510 "uxtl v5.8h, v25.8b \n\t" // widen green to 16 bits

	511 "uxtl v6.8h, v26.8b \n\t" // widen blue to 16 bits

	512

	513 "shl v4.8h, v4.8h, #2 \n\t" // src red = src_red << 2 (later will do >> 5 to make 5 bit red)

	514 "shl v5.8h, v5.8h, #3 \n\t" // src grn = src_grn << 3 (later will do >> 5 to make 6 bit grn)

	515 "shl v6.8h, v6.8h, #2 \n\t" // src blu = src_blu << 2 (later will do >> 5 to make 5 bit blu)

	516

	517 "movi v21.8h, #1, lsl#8 \n\t" // set up constant 256 (1<<8)

	518 "uxtl v14.8h, v27.8b \n\t" // widen alpha to 16 bits

	519 "sub v14.8h, v21.8h, v14.8h \n\t" // 256 - sa

	520 "ushr v14.8h, v14.8h, #3 \n\t" // (256 - sa) >> 3

	521

	522 "front: \n\t"

	523 "ld1 {v0.8h, v1.8h}, [%[dst]] \n\t" // load sixteen dst RGB565 pixels

	524 //set PREFETCH_DISTANCE to 128

	525 "prfum pldl1keep, [%[dst], #128] \n\t"

	526

	527 "subs x1, x1, #1 \n\t" // decrement loop counter

	528

	529 "shl v9.8h, v0.8h, #5 \n\t" // shift green to top of lanes

	530

	531 "shl v10.8h, v0.8h, #11 \n\t" // shift blue to top of lanes

	532 "ushr v10.8h, v10.8h, #11 \n\t" // extract blue

	533

	534 "ushr v8.8h, v0.8h, #11 \n\t" // extract red

	535 "ushr v9.8h, v9.8h, #10 \n\t" // extract green

	536

	537 "shl v3.8h, v1.8h, #5 \n\t" // shift green to top of lanes

	538

	539 "shl v7.8h, v1.8h, #11 \n\t" // shift blue to top of lanes

	540 "ushr v7.8h, v7.8h, #11 \n\t" // extract blue

	541

	542 "ushr v2.8h, v1.8h, #11 \n\t" // extract red

	543 "ushr v3.8h, v3.8h, #10 \n\t" // extract green

	544

	545 //If we use src in mla, directly, vd.8h is updated by mla, so need to calculate src in next loop again, which requires 12 cycles.

	546 //Instead, copy src to other registers and use them as a mla d estination.

	547 //6 shl commands are needed, but we don't need to update src. In total, will get 6 cycle benefit in each loop.

	548

	549 "shl v15.8h, v4.8h, #0 \n\t" // copy dst 0 red result

	550 "shl v16.8h, v5.8h, #0 \n\t" // copy dst 0 grn result

	551 "shl v17.8h, v6.8h, #0 \n\t" // copy dst 0 blu result

	552 "mla v15.8h, v8.8h, v14.8h \n\t" // dst 0 red result = dst_red * dst_scale

	553 "mla v16.8h, v9.8h, v14.8h \n\t" // dst 0 grn result = dst_grn * dst_scale

	554 "mla v17.8h, v10.8h, v14.8h \n\t" // dst 0 blu result = dst_blu * dst_scale

	555

	556 "shl v20.8h, v4.8h, #0 \n\t" // copy dst 1 red result

	557 "shl v19.8h, v5.8h, #0 \n\t" // copy dst 1 grn result

	558 "shl v18.8h, v6.8h, #0 \n\t" // copy dst 1 blu result

	559 "mla v20.8h, v2.8h, v14.8h \n\t" // dst 1 red result = dst_red * dst_scale

	560 "mla v19.8h, v3.8h, v14.8h \n\t" // dst 1 grn result = dst_grn * dst_scale

	561 "mla v18.8h, v7.8h, v14.8h \n\t" // dst 1 blu result = dst_blu * dst_scale

	562

	563 "ushr v15.8h, v15.8h, #5 \n\t" // dst 0 red result >> 5

	564 "ushr v16.8h, v16.8h, #5 \n\t" // dst 0 grn result >> 5

	565 "ushr v17.8h, v17.8h, #5 \n\t" // dst 0 blu result >> 5

	566

	567 "ushr v20.8h, v20.8h, #5 \n\t" // dst 1 red result >> 5

	568 "ushr v19.8h, v19.8h, #5 \n\t" // dst 1 grn result >> 5

	569 "ushr v18.8h, v18.8h, #5 \n\t" // dst 1 blu result >> 5

	570

	571 "sli v17.8h, v16.8h, #5 \n\t" // dst 0 insert green into blue

	572 "sli v17.8h, v15.8h, #11 \n\t" // dst 0 insert red into green/blue

	573

	574 "sli v18.8h, v19.8h, #5 \n\t" // dst 1 insert green into blue

	575 "sli v18.8h, v20.8h, #11 \n\t" // dst 1 insert red into green/blue

	576

	577 "st1 {v17.8h, v18.8h}, [%[dst]], #32 \n\t" // write pixel back to dst 0 and dst 1, update ptr

	578 "cbnz x1, front \n\t" // if counter !=0, loop

	579 "back: \n\t" // exit

	580

	581 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

	582 : : "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" , "v8", "v9", "v10", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",

	583 "v24", "v25", "v26", "v27"

	584 );

	585 #else

	586 asm volatile (

	587 "movs r4, %[count], lsr #4 \n\t" // cal c. count>>4

	588 "beq 2f \n\t" // if (count>>4) == 0, exit

	589 "vmov.u16 q15, #0x1f \n\t" // set up blue mask

	590 "vld4.u8 {d24, d25, d26, d27}, [%[src]] \n\t" // loa d eight src ABGR32 pixels

	591

	592 "vmov r5, r6, d24 \n\t" // sav e src red in r5, r6

	593 "vmov r7, r8, d25 \n\t" // sav e src green in r7, r8

	594 "vmov r9, r10, d26 \n\t" // sav e src blue in r9, r10

	595 "vmov r11, r12, d27 \n\t" // sav e src alpha in r11, r12

	596

	597

	598 "1: \n\t"

	599 "vld1.u16 {d0, d1, d2, d3}, [%[dst]] \n\t" // loa d sixteen dst RGB565 pixels

	600 //set PREFETCH_DISTANCE to 128

	601 "pld [%[dst], #128] \n\t"

	602

	603 "subs r4, r4, #1 \n\t" // dec rement loop counter

	604

	605 "vmov d24, r5, r6 \n\t" // src red to d24

	606 "vmov d25, r7, r8 \n\t" // src green to d25

	607 "vmov d26, r9, r10 \n\t" // src blue to d26

	608 "vmov d27, r11, r12 \n\t" // src alpha to d27

	609

	610 "vmov.u16 q3, #256 \n\t" // set up constant

	611 "vmovl.u8 q14, d27 \n\t" // wid en alpha to 16 bits

	612 // dst_scale = q14

	613 "vsub.u16 q14, q3, q14 \n\t" // 256 - sa

	614 "vshr.u16 q14, q14, #3 \n\t" // (25 6 - sa) >> 3

	615

	616

	617 // dst_0_rgb = {q8, q9, q10}

	618 "vshl.u16 q9, q0, #5 \n\t" // shi ft green to top of lanes

	619 "vand q10, q0, q15 \n\t" // ext ract blue

	620 "vshr.u16 q8, q0, #11 \n\t" // ext ract red

	621 "vshr.u16 q9, q9, #10 \n\t" // ext ract green

	622

	623 //use q3 for dst_1 green. In the next loop, needs to set q3 to 256 again.

	624 // dst_1_rgb = {q2, q3, q7}

	625 "vshl.u16 q3, q1, #5 \n\t" // shi ft green to top of lanes

	626 "vand q7, q1, q15 \n\t" // ext ract blue

	627 "vshr.u16 q2, q1, #11 \n\t" // ext ract red

	628 "vshr.u16 q3, q3, #10 \n\t" // ext ract green

	629

	630 // srcrgba = {q4, q5, q6, q14}, alpha calculation is done alre ady in above.

	631 // q4, q5, q6 will have each channel's result of dst_1_rgb.

	632 "vmovl.u8 q4, d24 \n\t" // wid en red to 16 bits

	633 "vmovl.u8 q5, d25 \n\t" // wid en green to 16 bits

	634 "vmovl.u8 q6, d26 \n\t" // wid en blue to 16 bits

	635

	636 // srcrgba = {q11, q12, q13, q14}, alpha calculation is done a lready in above.

	637 // q11, q12, q13 will have each channel's result of dst_0_rgb.

	638 "vmovl.u8 q11, d24 \n\t" // wid en red to 16 bits

	639 "vmovl.u8 q12, d25 \n\t" // wid en green to 16 bits

	640 "vmovl.u8 q13, d26 \n\t" // wid en blue to 16 bits

	641

	642 "vshl.u16 q11, q11, #2 \n\t" // dst 0 red result = src_red << 2 (later will do >> 5 to make 5 bit red)

	643 "vshl.u16 q12, q12, #3 \n\t" // dst 0 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)

	644 "vshl.u16 q13, q13, #2 \n\t" // dst 0 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)

	645

	646 "vshl.u16 q4, q4, #2 \n\t" // dst 1 red result = src_red << 2 (later will do >> 5 to make 5 bit red)

	647 "vshl.u16 q5, q5, #3 \n\t" // dst 1 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)

	648 "vshl.u16 q6, q6, #2 \n\t" // dst 1 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)

	649

	650 "vmla.u16 q11, q8, q14 \n\t" // dst 0 red result += dst_red * dst_scale

	651 "vmla.u16 q12, q9, q14 \n\t" // dst 0 grn result += dst_grn * dst_scale

	652 "vmla.u16 q13, q10, q14 \n\t" // dst 0 blu result += dst_blu * dst_scale

	653

	654 "vmla.u16 q4, q2, q14 \n\t" // dst 1 red result += dst_red * dst_scale

	655 "vmla.u16 q5, q3, q14 \n\t" // dst 1 grn result += dst_grn * dst_scale

	656 "vmla.u16 q6, q7, q14 \n\t" // dst 1 blu result += dst_blu * dst_scale

	657

	658 "vshr.u16 q11, q11, #5 \n\t" // dst 0 red result >> 5

	659 "vshr.u16 q12, q12, #5 \n\t" // dst 0 grn result >> 5

	660 "vshr.u16 q13, q13, #5 \n\t" // dst 0 blu result >> 5

	661

	662 "vshr.u16 q4, q4, #5 \n\t" // dst 1 red result >> 5

	663 "vshr.u16 q5, q5, #5 \n\t" // dst 1 grn result >> 5

	664 "vshr.u16 q14, q6, #5 \n\t" // dst 1 blu result >> 5

	665

	666 "vsli.u16 q13, q12, #5 \n\t" // dst 0 insert green into blue

	667 "vsli.u16 q13, q11, #11 \n\t" // dst 0 insert red into green/blue

	668

	669 "vsli.u16 q14, q5, #5 \n\t" // dst 1 insert green into blue

	670 "vsli.u16 q14, q4, #11 \n\t" // dst 1 insert red into green/blue

	671

	672 "vst1.16 {d26, d27, d28, d29}, [%[dst]]! \n\t" // wri te pixel back to dst 0 and dst 1, update ptr

	673

	674 "bne 1b \n\t" // if counter != 0, loop

	675 "2: \n\t" // exi t

	676

	677 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

	678 :

	679 : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", " r11", "r12",

	680 "d0", "d1", "d2", "d3", "d4", "d5", "d 6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",

	681 "d17", "d18", "d19", "d20", "d21", "d2 2", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"

	682 );

	683 #endif

	684 count &= 0xF;

	685 if (count > 0) {

	686 do {

	687 uint32_t dst_expand = SkExpand_rgb_16(dst) scale;

	688 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	689 dst += 1;

	690 } while (--count != 0);

	691 }

	692 }

	693

468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {	694 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {

469 prod += vdupq_n_u16(128);	695 prod += vdupq_n_u16(128);

470 prod += vshrq_n_u16(prod, 8);	696 prod += vshrq_n_u16(prod, 8);

471 return vshrq_n_u16(prod, 8);	697 return vshrq_n_u16(prod, 8);

472 }	698 }

473	699

474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,	700 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,

475 const SkPMColor* SK_RESTRICT src, int count,	701 const SkPMColor* SK_RESTRICT src, int count,

476 U8CPU alpha, int /x/, int /y/) {	702 U8CPU alpha, int /x/, int /y/) {

477 SkASSERT(255 > alpha);	703 SkASSERT(255 > alpha);

(...skipping 1180 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1658 // https://code.google.com/p/skia/issues/detail?id=2797	1884 // https://code.google.com/p/skia/issues/detail?id=2797

1659 #endif	1885 #endif

1660	1886

1661 // dither	1887 // dither

1662 S32_D565_Opaque_Dither_neon,	1888 S32_D565_Opaque_Dither_neon,

1663 S32_D565_Blend_Dither_neon,	1889 S32_D565_Blend_Dither_neon,

1664 S32A_D565_Opaque_Dither_neon,	1890 S32A_D565_Opaque_Dither_neon,

1665 NULL, // S32A_D565_Blend_Dither	1891 NULL, // S32A_D565_Blend_Dither

1666 };	1892 };

1667	1893

	1894 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {

	1895 #if 0

	1896 Color32_D565_neon,

	1897 Color32A_D565_neon,

	1898 Color32_D565_Dither_neon,

	1899 Color32A_D565_Dither_neon

	1900 #else

	1901 // TODO: stop cheating and fill in the above specializations!

	1902 Color32A_D565_neon,

	1903 Color32A_D565_neon,

	1904 Color32A_D565_neon,

	1905 Color32A_D565_neon,

	1906 #endif

	1907 };

	1908

1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {	1909 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {

1669 NULL, // S32_Opaque,	1910 NULL, // S32_Opaque,

1670 S32_Blend_BlitRow32_neon, // S32_Blend,	1911 S32_Blend_BlitRow32_neon, // S32_Blend,

1671 /*	1912 /*

1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha	1913 * We have two choices for S32A_Opaque procs. The one reads the src alpha

1673 * value and attempts to optimize accordingly. The optimization is	1914 * value and attempts to optimize accordingly. The optimization is

1674 * sensitive to the source content and is not a win in all cases. For	1915 * sensitive to the source content and is not a win in all cases. For

1675 * example, if there are a lot of transitions between the alpha states,	1916 * example, if there are a lot of transitions between the alpha states,

1676 * the performance will almost certainly be worse. However, for many	1917 * the performance will almost certainly be worse. However, for many

1677 * common cases the performance is equivalent or better than the standard	1918 * common cases the performance is equivalent or better than the standard

1678 * case where we do not inspect the src alpha.	1919 * case where we do not inspect the src alpha.

1679 */	1920 */

1680 #if SK_A32_SHIFT == 24	1921 #if SK_A32_SHIFT == 24

1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1922 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1923 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1683 #else	1924 #else

1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1925 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1685 #endif	1926 #endif

1686 #ifdef SK_CPU_ARM32	1927 #ifdef SK_CPU_ARM32

1687 S32A_Blend_BlitRow32_neon // S32A_Blend	1928 S32A_Blend_BlitRow32_neon // S32A_Blend

1688 #else	1929 #else

1689 NULL	1930 NULL

1690 #endif	1931 #endif

1691 };	1932 };

OLD	NEW

« src/core/SkBlitter_RGB16.cpp ('K') | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | no next file » | no next file with comments »