src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 847363002: skia: blend32_16_row for neon version

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 847363002: skia: blend32_16_row for neon version (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: skia: blend32_16_row for neon version Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 447 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
458 SkPMColorAssert(c);	458 SkPMColorAssert(c);

459 if (c) {	459 if (c) {

460 dst = SkSrcOver32To16(c, dst);	460 dst = SkSrcOver32To16(c, dst);

461 }	461 }

462 dst += 1;	462 dst += 1;

463 } while (--count != 0);	463 } while (--count != 0);

464 }	464 }

465 }	465 }

466 #endif // #ifdef SK_CPU_ARM32	466 #endif // #ifdef SK_CPU_ARM32

467	467

	468 static uint32_t pmcolor_to_expand16(SkPMColor c) {

	469 unsigned r = SkGetPackedR32(c);

	470 unsigned g = SkGetPackedG32(c);

	471 unsigned b = SkGetPackedB32(c);

	472 return (g << 24) \| (r << 13) \| (b << 2);

	473 }

	474

	475 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {

	476 uint32_t src_expand;

	477 unsigned scale;

	478 uint16x8_t vmask_blue;

	479

	480 if (count <= 0) return;

	481 SkASSERT(((size_t)dst & 0x01) == 0);

	482

	483 /*

	484 * This preamble code is in order to make dst aligned to 8 bytes

	485 * in the next mutiple bytes read & write access.

	486 */

	487 src_expand = pmcolor_to_expand16(src);

	488 scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

	489

	490 #define DST_ALIGN 8

	491

	492 /*

	493 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time.

	494 */

	495 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

	496

	497 for (int i = 0; i < preamble_size; i+=2, dst++) {

	498 uint32_t dst_expand = SkExpand_rgb_16(dst) scale;

	499 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	500 if (--count == 0)

	501 break;

	502 }

	503

	504 int count16 = 0;

	505 count16 = count >> 4;

	506 vmask_blue = vmovq_n_u16(SK_B16_MASK);

	507

	508 if (count16) {

	509 uint16x8_t wide_sr;

	510 uint16x8_t wide_sg;

	511 uint16x8_t wide_sb;

	512 uint16x8_t wide_256_sa;

	513

	514 unsigned sr = SkGetPackedR32(src);

	515 unsigned sg = SkGetPackedG32(src);

	516 unsigned sb = SkGetPackedB32(src);

	517 unsigned sa = SkGetPackedA32(src);

	518

	519 // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb

	520 // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,

	521 //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)

	522 wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

	523

	524 // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,

	525 //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)

	526 wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

	527

	528 // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,

	529 //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)

	530 wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

	531

	532 wide_256_sa =

	533 vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

	534

	535 while (count16-- > 0) {

	536 uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;

	537 uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;

	538 vdst1 = vld1q_u16(dst);

	539 dst += 8;

	540 vdst2 = vld1q_u16(dst);

	541 dst -= 8; //to store dst again.

	542

	543 vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes

	544 vdst1_b = vdst1 & vmask_blue; // extrac t blue

	545 vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extrac t red

	546 vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extrac t green

	547

	548 vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes

	549 vdst2_b = vdst2 & vmask_blue; // extrac t blue

	550 vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extrac t red

	551 vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extrac t green

	552

	553 vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + ( 256-sa) x dr1

	554 vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + ( 256-sa) x dg1

	555 vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + ( 256-sa) x db1

	556

	557 vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + ( 256-sa) x dr2

	558 vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + ( 256-sa) x dg2

	559 vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + ( 256-sa) x db2

	560

	561 vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red

	562 vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green

	563 vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue

	564

	565 vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue

	566 vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue

	567

	568 vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red

	569 vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green

	570 vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue

	571

	572 vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue

	573 vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue

	574

	575 vst1q_u16(dst, vdst1);

	576 dst += 8;

	577 vst1q_u16(dst, vdst2);

	578 dst += 8;

	579 }

	580 }

	581

	582 count &= 0xF;

	583 if (count > 0) {

	584 do {

	585 uint32_t dst_expand = SkExpand_rgb_16(dst) scale;

	586 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	587 dst += 1;

	588 } while (--count != 0);

	589 }

	590 }

	591

468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {	592 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {

469 prod += vdupq_n_u16(128);	593 prod += vdupq_n_u16(128);

470 prod += vshrq_n_u16(prod, 8);	594 prod += vshrq_n_u16(prod, 8);

471 return vshrq_n_u16(prod, 8);	595 return vshrq_n_u16(prod, 8);

472 }	596 }

473	597

474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,	598 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,

475 const SkPMColor* SK_RESTRICT src, int count,	599 const SkPMColor* SK_RESTRICT src, int count,

476 U8CPU alpha, int /x/, int /y/) {	600 U8CPU alpha, int /x/, int /y/) {

477 SkASSERT(255 > alpha);	601 SkASSERT(255 > alpha);

(...skipping 1180 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1658 // https://code.google.com/p/skia/issues/detail?id=2797	1782 // https://code.google.com/p/skia/issues/detail?id=2797

1659 #endif	1783 #endif

1660	1784

1661 // dither	1785 // dither

1662 S32_D565_Opaque_Dither_neon,	1786 S32_D565_Opaque_Dither_neon,

1663 S32_D565_Blend_Dither_neon,	1787 S32_D565_Blend_Dither_neon,

1664 S32A_D565_Opaque_Dither_neon,	1788 S32A_D565_Opaque_Dither_neon,

1665 NULL, // S32A_D565_Blend_Dither	1789 NULL, // S32A_D565_Blend_Dither

1666 };	1790 };

1667	1791

	1792 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {

	1793 #if 0
	djsollen 2015/01/21 14:36:53 remove the #if 0 block and replace with... Color remove the #if 0 block and replace with... Color32A_D565_neon, // Color32_D565, Color32A_D565_neon, // Color32A_D565, Color32A_D565_neon, // Color32_D565_Dither, Color32A_D565_neon // Color32A_D565_Dither mlee 2015/01/22 12:19:35 Done. Show quoted text On 2015/01/21 14:36:53, djsollen wrote: > remove the #if 0 block and replace with... > > Color32A_D565_neon, // Color32_D565, > Color32A_D565_neon, // Color32A_D565, > Color32A_D565_neon, // Color32_D565_Dither, > Color32A_D565_neon // Color32A_D565_Dither Done.
	1794 Color32_D565_neon,

	1795 Color32A_D565_neon,

	1796 Color32_D565_Dither_neon,

	1797 Color32A_D565_Dither_neon

	1798 #else

	1799 // TODO: stop cheating and fill in the above specializations!

	1800 Color32A_D565_neon,

	1801 Color32A_D565_neon,

	1802 Color32A_D565_neon,

	1803 Color32A_D565_neon,

	1804 #endif

	1805 };

	1806

1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {	1807 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {

1669 NULL, // S32_Opaque,	1808 NULL, // S32_Opaque,

1670 S32_Blend_BlitRow32_neon, // S32_Blend,	1809 S32_Blend_BlitRow32_neon, // S32_Blend,

1671 /*	1810 /*

1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha	1811 * We have two choices for S32A_Opaque procs. The one reads the src alpha

1673 * value and attempts to optimize accordingly. The optimization is	1812 * value and attempts to optimize accordingly. The optimization is

1674 * sensitive to the source content and is not a win in all cases. For	1813 * sensitive to the source content and is not a win in all cases. For

1675 * example, if there are a lot of transitions between the alpha states,	1814 * example, if there are a lot of transitions between the alpha states,

1676 * the performance will almost certainly be worse. However, for many	1815 * the performance will almost certainly be worse. However, for many

1677 * common cases the performance is equivalent or better than the standard	1816 * common cases the performance is equivalent or better than the standard

1678 * case where we do not inspect the src alpha.	1817 * case where we do not inspect the src alpha.

1679 */	1818 */

1680 #if SK_A32_SHIFT == 24	1819 #if SK_A32_SHIFT == 24

1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1820 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1821 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1683 #else	1822 #else

1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1823 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1685 #endif	1824 #endif

1686 #ifdef SK_CPU_ARM32	1825 #ifdef SK_CPU_ARM32

1687 S32A_Blend_BlitRow32_neon // S32A_Blend	1826 S32A_Blend_BlitRow32_neon // S32A_Blend

1688 #else	1827 #else

1689 NULL	1828 NULL

1690 #endif	1829 #endif

1691 };	1830 };

OLD	NEW

« src/core/SkBlitter_RGB16.cpp ('K') | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | no next file » | no next file with comments »