| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 853 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 864 | 864 |
| 865 uint16_t d = *dst; | 865 uint16_t d = *dst; |
| 866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
| 867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
| 868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
| 869 DITHER_INC_X(x); | 869 DITHER_INC_X(x); |
| 870 } while (--count != 0); | 870 } while (--count != 0); |
| 871 } | 871 } |
| 872 } | 872 } |
| 873 | 873 |
| 874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | |
| 875 const SkPMColor* SK_RESTRICT src, | |
| 876 int count, U8CPU alpha) { | |
| 877 | |
| 878 SkASSERT(255 == alpha); | |
| 879 if (count > 0) { | |
| 880 | |
| 881 | |
| 882 uint8x8_t alpha_mask; | |
| 883 | |
| 884 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | |
| 885 alpha_mask = vld1_u8(alpha_mask_setup); | |
| 886 | |
| 887 /* do the NEON unrolled code */ | |
| 888 #define UNROLL 4 | |
| 889 while (count >= UNROLL) { | |
| 890 uint8x8_t src_raw, dst_raw, dst_final; | |
| 891 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | |
| 892 | |
| 893 /* The two prefetches below may make the code slightly | |
| 894 * slower for small values of count but are worth having | |
| 895 * in the general case. | |
| 896 */ | |
| 897 __builtin_prefetch(src+32); | |
| 898 __builtin_prefetch(dst+32); | |
| 899 | |
| 900 /* get the source */ | |
| 901 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | |
| 902 #if UNROLL > 2 | |
| 903 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | |
| 904 #endif | |
| 905 | |
| 906 /* get and hold the dst too */ | |
| 907 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | |
| 908 #if UNROLL > 2 | |
| 909 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | |
| 910 #endif | |
| 911 | |
| 912 /* 1st and 2nd bits of the unrolling */ | |
| 913 { | |
| 914 uint8x8_t dst_cooked; | |
| 915 uint16x8_t dst_wide; | |
| 916 uint8x8_t alpha_narrow; | |
| 917 uint16x8_t alpha_wide; | |
| 918 | |
| 919 /* get the alphas spread out properly */ | |
| 920 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | |
| 921 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 922 | |
| 923 /* spread the dest */ | |
| 924 dst_wide = vmovl_u8(dst_raw); | |
| 925 | |
| 926 /* alpha mul the dest */ | |
| 927 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 928 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 929 | |
| 930 /* sum -- ignoring any byte lane overflows */ | |
| 931 dst_final = vadd_u8(src_raw, dst_cooked); | |
| 932 } | |
| 933 | |
| 934 #if UNROLL > 2 | |
| 935 /* the 3rd and 4th bits of our unrolling */ | |
| 936 { | |
| 937 uint8x8_t dst_cooked; | |
| 938 uint16x8_t dst_wide; | |
| 939 uint8x8_t alpha_narrow; | |
| 940 uint16x8_t alpha_wide; | |
| 941 | |
| 942 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | |
| 943 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 944 | |
| 945 /* spread the dest */ | |
| 946 dst_wide = vmovl_u8(dst_raw_2); | |
| 947 | |
| 948 /* alpha mul the dest */ | |
| 949 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 950 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 951 | |
| 952 /* sum -- ignoring any byte lane overflows */ | |
| 953 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | |
| 954 } | |
| 955 #endif | |
| 956 | |
| 957 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | |
| 958 #if UNROLL > 2 | |
| 959 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); | |
| 960 #endif | |
| 961 | |
| 962 src += UNROLL; | |
| 963 dst += UNROLL; | |
| 964 count -= UNROLL; | |
| 965 } | |
| 966 #undef UNROLL | |
| 967 | |
| 968 /* do any residual iterations */ | |
| 969 while (--count >= 0) { | |
| 970 *dst = SkPMSrcOver(*src, *dst); | |
| 971 src += 1; | |
| 972 dst += 1; | |
| 973 } | |
| 974 } | |
| 975 } | |
| 976 | |
| 977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, | |
| 978 const SkPMColor* SK_RESTRICT src, | |
| 979 int count, U8CPU alpha) { | |
| 980 SkASSERT(255 == alpha); | |
| 981 | |
| 982 if (count <= 0) | |
| 983 return; | |
| 984 | |
| 985 /* Use these to check if src is transparent or opaque */ | |
| 986 const unsigned int ALPHA_OPAQ = 0xFF000000; | |
| 987 const unsigned int ALPHA_TRANS = 0x00FFFFFF; | |
| 988 | |
| 989 #define UNROLL 4 | |
| 990 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); | |
| 991 const SkPMColor* SK_RESTRICT src_temp = src; | |
| 992 | |
| 993 /* set up the NEON variables */ | |
| 994 uint8x8_t alpha_mask; | |
| 995 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | |
| 996 alpha_mask = vld1_u8(alpha_mask_setup); | |
| 997 | |
| 998 uint8x8_t src_raw, dst_raw, dst_final; | |
| 999 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | |
| 1000 uint8x8_t dst_cooked; | |
| 1001 uint16x8_t dst_wide; | |
| 1002 uint8x8_t alpha_narrow; | |
| 1003 uint16x8_t alpha_wide; | |
| 1004 | |
| 1005 /* choose the first processing type */ | |
| 1006 if( src >= src_end) | |
| 1007 goto TAIL; | |
| 1008 if(*src <= ALPHA_TRANS) | |
| 1009 goto ALPHA_0; | |
| 1010 if(*src >= ALPHA_OPAQ) | |
| 1011 goto ALPHA_255; | |
| 1012 /* fall-thru */ | |
| 1013 | |
| 1014 ALPHA_1_TO_254: | |
| 1015 do { | |
| 1016 | |
| 1017 /* get the source */ | |
| 1018 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | |
| 1019 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | |
| 1020 | |
| 1021 /* get and hold the dst too */ | |
| 1022 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | |
| 1023 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | |
| 1024 | |
| 1025 | |
| 1026 /* get the alphas spread out properly */ | |
| 1027 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | |
| 1028 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
| 1029 /* we collapsed (255-a)+1 ... */ | |
| 1030 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 1031 | |
| 1032 /* spread the dest */ | |
| 1033 dst_wide = vmovl_u8(dst_raw); | |
| 1034 | |
| 1035 /* alpha mul the dest */ | |
| 1036 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 1037 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 1038 | |
| 1039 /* sum -- ignoring any byte lane overflows */ | |
| 1040 dst_final = vadd_u8(src_raw, dst_cooked); | |
| 1041 | |
| 1042 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | |
| 1043 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
| 1044 /* we collapsed (255-a)+1 ... */ | |
| 1045 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 1046 | |
| 1047 /* spread the dest */ | |
| 1048 dst_wide = vmovl_u8(dst_raw_2); | |
| 1049 | |
| 1050 /* alpha mul the dest */ | |
| 1051 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 1052 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 1053 | |
| 1054 /* sum -- ignoring any byte lane overflows */ | |
| 1055 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | |
| 1056 | |
| 1057 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | |
| 1058 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); | |
| 1059 | |
| 1060 src += UNROLL; | |
| 1061 dst += UNROLL; | |
| 1062 | |
| 1063 /* if 2 of the next pixels aren't between 1 and 254 | |
| 1064 it might make sense to go to the optimized loops */ | |
| 1065 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) | |
| 1066 break; | |
| 1067 | |
| 1068 } while(src < src_end); | |
| 1069 | |
| 1070 if (src >= src_end) | |
| 1071 goto TAIL; | |
| 1072 | |
| 1073 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) | |
| 1074 goto ALPHA_255; | |
| 1075 | |
| 1076 /*fall-thru*/ | |
| 1077 | |
| 1078 ALPHA_0: | |
| 1079 | |
| 1080 /*In this state, we know the current alpha is 0 and | |
| 1081 we optimize for the next alpha also being zero. */ | |
| 1082 src_temp = src; //so we don't have to increment dst every time | |
| 1083 do { | |
| 1084 if(*(++src) > ALPHA_TRANS) | |
| 1085 break; | |
| 1086 if(*(++src) > ALPHA_TRANS) | |
| 1087 break; | |
| 1088 if(*(++src) > ALPHA_TRANS) | |
| 1089 break; | |
| 1090 if(*(++src) > ALPHA_TRANS) | |
| 1091 break; | |
| 1092 } while(src < src_end); | |
| 1093 | |
| 1094 dst += (src - src_temp); | |
| 1095 | |
| 1096 /* no longer alpha 0, so determine where to go next. */ | |
| 1097 if( src >= src_end) | |
| 1098 goto TAIL; | |
| 1099 if(*src >= ALPHA_OPAQ) | |
| 1100 goto ALPHA_255; | |
| 1101 else | |
| 1102 goto ALPHA_1_TO_254; | |
| 1103 | |
| 1104 ALPHA_255: | |
| 1105 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { | |
| 1106 dst[0]=src[0]; | |
| 1107 dst[1]=src[1]; | |
| 1108 dst[2]=src[2]; | |
| 1109 dst[3]=src[3]; | |
| 1110 src+=UNROLL; | |
| 1111 dst+=UNROLL; | |
| 1112 if(src >= src_end) | |
| 1113 goto TAIL; | |
| 1114 } | |
| 1115 | |
| 1116 //Handle remainder. | |
| 1117 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; | |
| 1118 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; | |
| 1119 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } | |
| 1120 } | |
| 1121 } | |
| 1122 | |
| 1123 if( src >= src_end) | |
| 1124 goto TAIL; | |
| 1125 if(*src <= ALPHA_TRANS) | |
| 1126 goto ALPHA_0; | |
| 1127 else | |
| 1128 goto ALPHA_1_TO_254; | |
| 1129 | |
| 1130 TAIL: | |
| 1131 /* do any residual iterations */ | |
| 1132 src_end += UNROLL + 1; //goto the real end | |
| 1133 while(src != src_end) { | |
| 1134 if( *src != 0 ) { | |
| 1135 if( *src >= ALPHA_OPAQ ) { | |
| 1136 *dst = *src; | |
| 1137 } | |
| 1138 else { | |
| 1139 *dst = SkPMSrcOver(*src, *dst); | |
| 1140 } | |
| 1141 } | |
| 1142 src++; | |
| 1143 dst++; | |
| 1144 } | |
| 1145 | |
| 1146 #undef UNROLL | |
| 1147 return; | |
| 1148 } | |
| 1149 | |
| 1150 /* Neon version of S32_Blend_BlitRow32() | 874 /* Neon version of S32_Blend_BlitRow32() |
| 1151 * portable version is in src/core/SkBlitRow_D32.cpp | 875 * portable version is in src/core/SkBlitRow_D32.cpp |
| 1152 */ | 876 */ |
| 1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 877 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 1154 const SkPMColor* SK_RESTRICT src, | 878 const SkPMColor* SK_RESTRICT src, |
| 1155 int count, U8CPU alpha) { | 879 int count, U8CPU alpha) { |
| 1156 SkASSERT(alpha <= 255); | 880 SkASSERT(alpha <= 255); |
| 1157 | 881 |
| 1158 if (count <= 0) { | 882 if (count <= 0) { |
| 1159 return; | 883 return; |
| (...skipping 394 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { | 1278 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { |
| 1555 Color32A_D565_neon, // Color32_D565, | 1279 Color32A_D565_neon, // Color32_D565, |
| 1556 Color32A_D565_neon, // Color32A_D565, | 1280 Color32A_D565_neon, // Color32A_D565, |
| 1557 Color32A_D565_neon, // Color32_D565_Dither, | 1281 Color32A_D565_neon, // Color32_D565_Dither, |
| 1558 Color32A_D565_neon, // Color32A_D565_Dither | 1282 Color32A_D565_neon, // Color32A_D565_Dither |
| 1559 }; | 1283 }; |
| 1560 | 1284 |
| 1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
| 1562 nullptr, // S32_Opaque, | 1286 nullptr, // S32_Opaque, |
| 1563 S32_Blend_BlitRow32_neon, // S32_Blend, | 1287 S32_Blend_BlitRow32_neon, // S32_Blend, |
| 1564 /* | 1288 nullptr, // Ported to SkOpts |
| 1565 * We have two choices for S32A_Opaque procs. The one reads the src alpha | |
| 1566 * value and attempts to optimize accordingly. The optimization is | |
| 1567 * sensitive to the source content and is not a win in all cases. For | |
| 1568 * example, if there are a lot of transitions between the alpha states, | |
| 1569 * the performance will almost certainly be worse. However, for many | |
| 1570 * common cases the performance is equivalent or better than the standard | |
| 1571 * case where we do not inspect the src alpha. | |
| 1572 */ | |
| 1573 #if SK_A32_SHIFT == 24 | |
| 1574 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | |
| 1575 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | |
| 1576 #else | |
| 1577 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | |
| 1578 #endif | |
| 1579 #ifdef SK_CPU_ARM32 | 1289 #ifdef SK_CPU_ARM32 |
| 1580 S32A_Blend_BlitRow32_neon // S32A_Blend | 1290 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1581 #else | 1291 #else |
| 1582 nullptr | 1292 nullptr |
| 1583 #endif | 1293 #endif |
| 1584 }; | 1294 }; |
| OLD | NEW |