Chromium Code Reviews

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 1820313002: Port S32A_opaque blit row to SkOpts. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: fmt Created 4 years, 9 months ago
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 853 matching lines...)
864 864
865 uint16_t d = *dst; 865 uint16_t d = *dst;
866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 867 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 868 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
869 DITHER_INC_X(x); 869 DITHER_INC_X(x);
870 } while (--count != 0); 870 } while (--count != 0);
871 } 871 }
872 } 872 }
873 873
874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
875 const SkPMColor* SK_RESTRICT src,
876 int count, U8CPU alpha) {
877
878 SkASSERT(255 == alpha);
879 if (count > 0) {
880
881
882 uint8x8_t alpha_mask;
883
884 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
885 alpha_mask = vld1_u8(alpha_mask_setup);
886
887 /* do the NEON unrolled code */
888 #define UNROLL 4
889 while (count >= UNROLL) {
890 uint8x8_t src_raw, dst_raw, dst_final;
891 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
892
893             /* The two prefetches below may make the code slightly
894 * slower for small values of count but are worth having
895 * in the general case.
896 */
897 __builtin_prefetch(src+32);
898 __builtin_prefetch(dst+32);
899
900 /* get the source */
901 src_raw = vreinterpret_u8_u32(vld1_u32(src));
902 #if UNROLL > 2
903 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
904 #endif
905
906 /* get and hold the dst too */
907 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
908 #if UNROLL > 2
909 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
910 #endif
911
912 /* 1st and 2nd bits of the unrolling */
913 {
914 uint8x8_t dst_cooked;
915 uint16x8_t dst_wide;
916 uint8x8_t alpha_narrow;
917 uint16x8_t alpha_wide;
918
919 /* get the alphas spread out properly */
920 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
921 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
922
923 /* spread the dest */
924 dst_wide = vmovl_u8(dst_raw);
925
926 /* alpha mul the dest */
927 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
928 dst_cooked = vshrn_n_u16(dst_wide, 8);
929
930 /* sum -- ignoring any byte lane overflows */
931 dst_final = vadd_u8(src_raw, dst_cooked);
932 }
933
934 #if UNROLL > 2
935 /* the 3rd and 4th bits of our unrolling */
936 {
937 uint8x8_t dst_cooked;
938 uint16x8_t dst_wide;
939 uint8x8_t alpha_narrow;
940 uint16x8_t alpha_wide;
941
942 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
943 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
944
945 /* spread the dest */
946 dst_wide = vmovl_u8(dst_raw_2);
947
948 /* alpha mul the dest */
949 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
950 dst_cooked = vshrn_n_u16(dst_wide, 8);
951
952 /* sum -- ignoring any byte lane overflows */
953 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
954 }
955 #endif
956
957 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
958 #if UNROLL > 2
959 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
960 #endif
961
962 src += UNROLL;
963 dst += UNROLL;
964 count -= UNROLL;
965 }
966 #undef UNROLL
967
968 /* do any residual iterations */
969 while (--count >= 0) {
970 *dst = SkPMSrcOver(*src, *dst);
971 src += 1;
972 dst += 1;
973 }
974 }
975 }
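For reference, a minimal scalar sketch of the per-pixel math each unrolled NEON lane above computes (illustrative only, not Skia's portable path; srcover_one_pixel is a hypothetical name, and alpha is assumed to sit in the top byte, which is what the {3,3,3,3,7,7,7,7} vtbl mask selects on a little-endian load):

#include <cstdint>

// Scalar equivalent of one pixel of the unrolled loop above (sketch).
// vtbl1_u8 broadcasts each pixel's alpha byte, vsubw_u8(vdupq_n_u16(256), ...)
// forms 256 - a, i.e. the collapsed SkAlpha255To256(255 - a) == (255 - a) + 1,
// and vmulq_u16 + vshrn_n_u16 give (dst * (256 - a)) >> 8 per channel.
static inline uint32_t srcover_one_pixel(uint32_t src, uint32_t dst) {
    uint32_t invScale = 256 - (src >> 24);          // assumes SK_A32_SHIFT == 24
    uint32_t result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t s = (src >> shift) & 0xFF;
        uint32_t d = (dst >> shift) & 0xFF;
        // vadd_u8 wraps on byte-lane overflow; premultiplied color keeps the
        // sum within 8 bits, so the & 0xFF below is a no-op in practice.
        result |= ((s + ((d * invScale) >> 8)) & 0xFF) << shift;
    }
    return result;
}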
976
977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
978 const SkPMColor* SK_RESTRICT src,
979 int count, U8CPU alpha) {
980 SkASSERT(255 == alpha);
981
982 if (count <= 0)
983 return;
984
985 /* Use these to check if src is transparent or opaque */
986 const unsigned int ALPHA_OPAQ = 0xFF000000;
987 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
988
989 #define UNROLL 4
990 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
991 const SkPMColor* SK_RESTRICT src_temp = src;
992
993 /* set up the NEON variables */
994 uint8x8_t alpha_mask;
995 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
996 alpha_mask = vld1_u8(alpha_mask_setup);
997
998 uint8x8_t src_raw, dst_raw, dst_final;
999 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
1000 uint8x8_t dst_cooked;
1001 uint16x8_t dst_wide;
1002 uint8x8_t alpha_narrow;
1003 uint16x8_t alpha_wide;
1004
1005 /* choose the first processing type */
1006 if( src >= src_end)
1007 goto TAIL;
1008 if(*src <= ALPHA_TRANS)
1009 goto ALPHA_0;
1010 if(*src >= ALPHA_OPAQ)
1011 goto ALPHA_255;
1012 /* fall-thru */
1013
1014 ALPHA_1_TO_254:
1015 do {
1016
1017 /* get the source */
1018 src_raw = vreinterpret_u8_u32(vld1_u32(src));
1019 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
1020
1021 /* get and hold the dst too */
1022 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
1023 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
1024
1025
1026 /* get the alphas spread out properly */
1027 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
1028 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
1029 /* we collapsed (255-a)+1 ... */
1030 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
1031
1032 /* spread the dest */
1033 dst_wide = vmovl_u8(dst_raw);
1034
1035 /* alpha mul the dest */
1036 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
1037 dst_cooked = vshrn_n_u16(dst_wide, 8);
1038
1039 /* sum -- ignoring any byte lane overflows */
1040 dst_final = vadd_u8(src_raw, dst_cooked);
1041
1042 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
1043 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
1044 /* we collapsed (255-a)+1 ... */
1045 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
1046
1047 /* spread the dest */
1048 dst_wide = vmovl_u8(dst_raw_2);
1049
1050 /* alpha mul the dest */
1051 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
1052 dst_cooked = vshrn_n_u16(dst_wide, 8);
1053
1054 /* sum -- ignoring any byte lane overflows */
1055 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
1056
1057 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
1058 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
1059
1060 src += UNROLL;
1061 dst += UNROLL;
1062
1063 /* if 2 of the next pixels aren't between 1 and 254
1064 it might make sense to go to the optimized loops */
1065         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
1066 break;
1067
1068 } while(src < src_end);
1069
1070 if (src >= src_end)
1071 goto TAIL;
1072
1073 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
1074 goto ALPHA_255;
1075
1076 /*fall-thru*/
1077
1078 ALPHA_0:
1079
1080 /*In this state, we know the current alpha is 0 and
1081 we optimize for the next alpha also being zero. */
1082 src_temp = src; //so we don't have to increment dst every time
1083 do {
1084 if(*(++src) > ALPHA_TRANS)
1085 break;
1086 if(*(++src) > ALPHA_TRANS)
1087 break;
1088 if(*(++src) > ALPHA_TRANS)
1089 break;
1090 if(*(++src) > ALPHA_TRANS)
1091 break;
1092 } while(src < src_end);
1093
1094 dst += (src - src_temp);
1095
1096 /* no longer alpha 0, so determine where to go next. */
1097 if( src >= src_end)
1098 goto TAIL;
1099 if(*src >= ALPHA_OPAQ)
1100 goto ALPHA_255;
1101 else
1102 goto ALPHA_1_TO_254;
1103
1104 ALPHA_255:
1105 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
1106 dst[0]=src[0];
1107 dst[1]=src[1];
1108 dst[2]=src[2];
1109 dst[3]=src[3];
1110 src+=UNROLL;
1111 dst+=UNROLL;
1112 if(src >= src_end)
1113 goto TAIL;
1114 }
1115
1116 //Handle remainder.
1117 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
1118 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
1119 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
1120 }
1121 }
1122
1123 if( src >= src_end)
1124 goto TAIL;
1125 if(*src <= ALPHA_TRANS)
1126 goto ALPHA_0;
1127 else
1128 goto ALPHA_1_TO_254;
1129
1130 TAIL:
1131 /* do any residual iterations */
1132 src_end += UNROLL + 1; //goto the real end
1133 while(src != src_end) {
1134 if( *src != 0 ) {
1135 if( *src >= ALPHA_OPAQ ) {
1136 *dst = *src;
1137 }
1138 else {
1139 *dst = SkPMSrcOver(*src, *dst);
1140 }
1141 }
1142 src++;
1143 dst++;
1144 }
1145
1146 #undef UNROLL
1147 return;
1148 }
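The function above is effectively a small state machine that classifies runs of source pixels by alpha: fully transparent runs leave the destination untouched, fully opaque runs degrade to plain copies, and only the 1..254 range pays for the full blend. A rough scalar sketch of that classification (illustrative only; srcover_classified is a hypothetical name, and the whole-pixel compares assume SK_A32_SHIFT == 24 so the alpha byte is the most significant one):

#include <cstdint>

// Sketch of the per-pixel classification behind the ALPHA_0 / ALPHA_255 /
// ALPHA_1_TO_254 labels above.
static void srcover_classified(uint32_t* dst, const uint32_t* src, int count) {
    const uint32_t ALPHA_OPAQ  = 0xFF000000;   // pixel >= this  =>  alpha == 0xFF
    const uint32_t ALPHA_TRANS = 0x00FFFFFF;   // pixel <= this  =>  alpha == 0x00
    for (int i = 0; i < count; ++i) {
        uint32_t s = src[i];
        if (s <= ALPHA_TRANS) {
            continue;                           // transparent source: dst unchanged
        }
        if (s >= ALPHA_OPAQ) {
            dst[i] = s;                         // opaque source: plain copy
            continue;
        }
        uint32_t invScale = 256 - (s >> 24);    // 1..254: usual src-over blend
        uint32_t d = dst[i], r = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            uint32_t sc = (s >> shift) & 0xFF;
            uint32_t dc = (d >> shift) & 0xFF;
            r |= ((sc + ((dc * invScale) >> 8)) & 0xFF) << shift;
        }
        dst[i] = r;
    }
}

The NEON version amortizes these checks over four pixels at a time and, in the blend state, only leaves when the next two pixels are both transparent or both opaque, which keeps it from thrashing when the source alpha changes frequently.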
1149
1150 /* Neon version of S32_Blend_BlitRow32() 874 /* Neon version of S32_Blend_BlitRow32()
1151 * portable version is in src/core/SkBlitRow_D32.cpp 875 * portable version is in src/core/SkBlitRow_D32.cpp
1152 */ 876 */
1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 877 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1154 const SkPMColor* SK_RESTRICT src, 878 const SkPMColor* SK_RESTRICT src,
1155 int count, U8CPU alpha) { 879 int count, U8CPU alpha) {
1156 SkASSERT(alpha <= 255); 880 SkASSERT(alpha <= 255);
1157 881
1158 if (count <= 0) { 882 if (count <= 0) {
1159 return; 883 return;
(...skipping 394 matching lines...)
1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { 1278 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1555 Color32A_D565_neon, // Color32_D565, 1279 Color32A_D565_neon, // Color32_D565,
1556 Color32A_D565_neon, // Color32A_D565, 1280 Color32A_D565_neon, // Color32A_D565,
1557 Color32A_D565_neon, // Color32_D565_Dither, 1281 Color32A_D565_neon, // Color32_D565_Dither,
1558 Color32A_D565_neon, // Color32A_D565_Dither 1282 Color32A_D565_neon, // Color32A_D565_Dither
1559 }; 1283 };
1560 1284
1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1562 nullptr, // S32_Opaque, 1286 nullptr, // S32_Opaque,
1563 S32_Blend_BlitRow32_neon, // S32_Blend, 1287 S32_Blend_BlitRow32_neon, // S32_Blend,
1564 /* 1288 nullptr, // Ported to SkOpts
1565 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1566 * value and attempts to optimize accordingly. The optimization is
1567 * sensitive to the source content and is not a win in all cases. For
1568 * example, if there are a lot of transitions between the alpha states,
1569 * the performance will almost certainly be worse. However, for many
1570 * common cases the performance is equivalent or better than the standard
1571 * case where we do not inspect the src alpha.
1572 */
1573 #if SK_A32_SHIFT == 24
1574     // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1575 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1576 #else
1577 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1578 #endif
1579 #ifdef SK_CPU_ARM32 1289 #ifdef SK_CPU_ARM32
1580 S32A_Blend_BlitRow32_neon // S32A_Blend 1290 S32A_Blend_BlitRow32_neon // S32A_Blend
1581 #else 1291 #else
1582 nullptr 1292 nullptr
1583 #endif 1293 #endif
1584 }; 1294 };
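On the new side, the S32A_Opaque slot is now nullptr: the platform table no longer overrides that case, and the core factory falls back to its default proc, which, per the CL description, is now backed by SkOpts. A hedged sketch of that dispatch pattern (not the actual Skia factory; factory32_sketch and the table parameters are illustrative):

#include <cstdint>

// Sketch: a nullptr entry in the platform table lets the caller fall back to a
// default (portable or SkOpts-backed) proc for the same flags index.
typedef void (*Proc32)(uint32_t* dst, const uint32_t* src, int count, unsigned alpha);

static Proc32 factory32_sketch(unsigned flags,
                               const Proc32 platform[4],   // e.g. the NEON table above
                               const Proc32 defaults[4]) { // portable / SkOpts-backed
    Proc32 proc = platform[flags & 3];
    return proc ? proc : defaults[flags & 3];
}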