OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 853 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
864 | 864 |
865 uint16_t d = *dst; | 865 uint16_t d = *dst; |
866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
869 DITHER_INC_X(x); | 869 DITHER_INC_X(x); |
870 } while (--count != 0); | 870 } while (--count != 0); |
871 } | 871 } |
872 } | 872 } |
873 | 873 |
/* NOTE(review): this function appears only in the OLD column -- the diff
 * deletes it (the S32A opaque blit was ported to SkOpts). OLD body kept
 * verbatim below for reference.
 * Contract: blits 'count' premultiplied src pixels over dst; alpha must be
 * 255 (asserted). The main loop handles UNROLL (4) pixels per pass with
 * 64-bit NEON ops, computing dst = src + ((dst * (256 - src_alpha)) >> 8)
 * per byte lane; any remainder is finished scalar with SkPMSrcOver. */
874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | |
875 const SkPMColor* SK_RESTRICT src, | |
876 int count, U8CPU alpha) { | |
877 | |
878 SkASSERT(255 == alpha); | |
879 if (count > 0) { | |
880 | |
881 | |
882 uint8x8_t alpha_mask; | |
883 | |
884 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | |
885 alpha_mask = vld1_u8(alpha_mask_setup); | |
/* vtbl1_u8 with {3,3,3,3,7,7,7,7} replicates byte 3 (pixel 0's alpha) and
 * byte 7 (pixel 1's alpha) across each 32-bit half of the vector. This
 * picks the right byte only when alpha is the top byte of each SkPMColor
 * (SK_A32_SHIFT == 24 on little-endian) -- NOTE(review): the proc table
 * below selects this function in the SK_A32_SHIFT != 24 branch; verify. */
886 | |
887 /* do the NEON unrolled code */ | |
888 #define UNROLL 4 | |
889 while (count >= UNROLL) { | |
890 uint8x8_t src_raw, dst_raw, dst_final; | |
891 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | |
892 | |
893 /* The two prefetches below may make the code slightly | |
894 * slower for small values of count but are worth having | |
895 * in the general case. | |
896 */ | |
897 __builtin_prefetch(src+32); | |
898 __builtin_prefetch(dst+32); | |
899 | |
900 /* get the source */ | |
901 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | |
902 #if UNROLL > 2 | |
903 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | |
904 #endif | |
905 | |
906 /* get and hold the dst too */ | |
907 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | |
908 #if UNROLL > 2 | |
909 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | |
910 #endif | |
911 | |
912 /* 1st and 2nd bits of the unrolling */ | |
913 { | |
914 uint8x8_t dst_cooked; | |
915 uint16x8_t dst_wide; | |
916 uint8x8_t alpha_narrow; | |
917 uint16x8_t alpha_wide; | |
918 | |
919 /* get the alphas spread out properly */ | |
920 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | |
921 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
922 | |
923 /* spread the dest */ | |
924 dst_wide = vmovl_u8(dst_raw); | |
925 | |
926 /* alpha mul the dest */ | |
927 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
928 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
929 | |
930 /* sum -- ignoring any byte lane overflows */ | |
931 dst_final = vadd_u8(src_raw, dst_cooked); | |
932 } | |
933 | |
934 #if UNROLL > 2 | |
935 /* the 3rd and 4th bits of our unrolling */ | |
936 { | |
937 uint8x8_t dst_cooked; | |
938 uint16x8_t dst_wide; | |
939 uint8x8_t alpha_narrow; | |
940 uint16x8_t alpha_wide; | |
941 | |
942 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | |
943 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
944 | |
945 /* spread the dest */ | |
946 dst_wide = vmovl_u8(dst_raw_2); | |
947 | |
948 /* alpha mul the dest */ | |
949 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
950 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
951 | |
952 /* sum -- ignoring any byte lane overflows */ | |
953 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | |
954 } | |
955 #endif | |
956 | |
957 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | |
958 #if UNROLL > 2 | |
959 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); | |
960 #endif | |
961 | |
962 src += UNROLL; | |
963 dst += UNROLL; | |
964 count -= UNROLL; | |
965 } | |
966 #undef UNROLL | |
967 | |
968 /* do any residual iterations */ | |
969 while (--count >= 0) { | |
970 *dst = SkPMSrcOver(*src, *dst); | |
971 src += 1; | |
972 dst += 1; | |
973 } | |
974 } | |
975 } | |
976 | |
/* NOTE(review): this function appears only in the OLD column -- the diff
 * deletes it (ported to SkOpts). OLD body kept verbatim below.
 * Same contract as S32A_Opaque_BlitRow32_neon (alpha must be 255,
 * asserted), but it classifies the incoming src alpha and hops between
 * three specialized loops via goto: ALPHA_0 skips runs of fully
 * transparent pixels, ALPHA_255 straight-copies runs of fully opaque
 * pixels, and ALPHA_1_TO_254 does the 4-pixel-per-pass NEON blend.
 * Classification compares whole SkPMColor words against 0xFF000000 /
 * 0x00FFFFFF, so it requires alpha in the top byte (SK_A32_SHIFT == 24),
 * as the proc-table comment below states. */
977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, | |
978 const SkPMColor* SK_RESTRICT src, | |
979 int count, U8CPU alpha) { | |
980 SkASSERT(255 == alpha); | |
981 | |
982 if (count <= 0) | |
983 return; | |
984 | |
985 /* Use these to check if src is transparent or opaque */ | |
986 const unsigned int ALPHA_OPAQ = 0xFF000000; | |
987 const unsigned int ALPHA_TRANS = 0x00FFFFFF; | |
988 | |
989 #define UNROLL 4 | |
990 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); | |
/* NOTE(review): src_end is pulled in by UNROLL + 1 so the unrolled loops
 * and their src[0..3] / next-pixel lookaheads stay within the row; the
 * TAIL loop adds the margin back ("goto the real end") and finishes the
 * last pixels one at a time. */
991 const SkPMColor* SK_RESTRICT src_temp = src; | |
992 | |
993 /* set up the NEON variables */ | |
994 uint8x8_t alpha_mask; | |
995 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | |
996 alpha_mask = vld1_u8(alpha_mask_setup); | |
997 | |
998 uint8x8_t src_raw, dst_raw, dst_final; | |
999 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | |
1000 uint8x8_t dst_cooked; | |
1001 uint16x8_t dst_wide; | |
1002 uint8x8_t alpha_narrow; | |
1003 uint16x8_t alpha_wide; | |
1004 | |
1005 /* choose the first processing type */ | |
1006 if( src >= src_end) | |
1007 goto TAIL; | |
1008 if(*src <= ALPHA_TRANS) | |
1009 goto ALPHA_0; | |
1010 if(*src >= ALPHA_OPAQ) | |
1011 goto ALPHA_255; | |
1012 /* fall-thru */ | |
1013 | |
1014 ALPHA_1_TO_254: | |
1015 do { | |
1016 | |
1017 /* get the source */ | |
1018 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | |
1019 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | |
1020 | |
1021 /* get and hold the dst too */ | |
1022 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | |
1023 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | |
1024 | |
1025 | |
1026 /* get the alphas spread out properly */ | |
1027 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | |
1028 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
1029 /* we collapsed (255-a)+1 ... */ | |
1030 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
1031 | |
1032 /* spread the dest */ | |
1033 dst_wide = vmovl_u8(dst_raw); | |
1034 | |
1035 /* alpha mul the dest */ | |
1036 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
1037 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
1038 | |
1039 /* sum -- ignoring any byte lane overflows */ | |
1040 dst_final = vadd_u8(src_raw, dst_cooked); | |
1041 | |
1042 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | |
1043 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
1044 /* we collapsed (255-a)+1 ... */ | |
1045 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
1046 | |
1047 /* spread the dest */ | |
1048 dst_wide = vmovl_u8(dst_raw_2); | |
1049 | |
1050 /* alpha mul the dest */ | |
1051 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
1052 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
1053 | |
1054 /* sum -- ignoring any byte lane overflows */ | |
1055 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | |
1056 | |
1057 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | |
1058 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); | |
1059 | |
1060 src += UNROLL; | |
1061 dst += UNROLL; | |
1062 | |
1063 /* if 2 of the next pixels aren't between 1 and 254 | |
1064 it might make sense to go to the optimized loops */ | |
1065 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_
OPAQ && src[1] >= ALPHA_OPAQ)) | |
1066 break; | |
1067 | |
1068 } while(src < src_end); | |
1069 | |
1070 if (src >= src_end) | |
1071 goto TAIL; | |
1072 | |
1073 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) | |
1074 goto ALPHA_255; | |
1075 | |
1076 /*fall-thru*/ | |
1077 | |
1078 ALPHA_0: | |
1079 | |
1080 /*In this state, we know the current alpha is 0 and | |
1081 we optimize for the next alpha also being zero. */ | |
1082 src_temp = src; //so we don't have to increment dst every time | |
1083 do { | |
1084 if(*(++src) > ALPHA_TRANS) | |
1085 break; | |
1086 if(*(++src) > ALPHA_TRANS) | |
1087 break; | |
1088 if(*(++src) > ALPHA_TRANS) | |
1089 break; | |
1090 if(*(++src) > ALPHA_TRANS) | |
1091 break; | |
1092 } while(src < src_end); | |
1093 | |
1094 dst += (src - src_temp); | |
1095 | |
1096 /* no longer alpha 0, so determine where to go next. */ | |
1097 if( src >= src_end) | |
1098 goto TAIL; | |
1099 if(*src >= ALPHA_OPAQ) | |
1100 goto ALPHA_255; | |
1101 else | |
1102 goto ALPHA_1_TO_254; | |
1103 | |
1104 ALPHA_255: | |
/* AND-ing the four words keeps the top byte at 0xFF only if all four
 * pixels are fully opaque, so the compare covers the whole group. */
1105 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { | |
1106 dst[0]=src[0]; | |
1107 dst[1]=src[1]; | |
1108 dst[2]=src[2]; | |
1109 dst[3]=src[3]; | |
1110 src+=UNROLL; | |
1111 dst+=UNROLL; | |
1112 if(src >= src_end) | |
1113 goto TAIL; | |
1114 } | |
1115 | |
1116 //Handle remainder. | |
1117 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; | |
1118 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; | |
1119 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } | |
1120 } | |
1121 } | |
1122 | |
1123 if( src >= src_end) | |
1124 goto TAIL; | |
1125 if(*src <= ALPHA_TRANS) | |
1126 goto ALPHA_0; | |
1127 else | |
1128 goto ALPHA_1_TO_254; | |
1129 | |
1130 TAIL: | |
1131 /* do any residual iterations */ | |
1132 src_end += UNROLL + 1; //goto the real end | |
1133 while(src != src_end) { | |
1134 if( *src != 0 ) { | |
1135 if( *src >= ALPHA_OPAQ ) { | |
1136 *dst = *src; | |
1137 } | |
1138 else { | |
1139 *dst = SkPMSrcOver(*src, *dst); | |
1140 } | |
1141 } | |
1142 src++; | |
1143 dst++; | |
1144 } | |
1145 | |
1146 #undef UNROLL | |
1147 return; | |
1148 } | |
1149 | |
1150 /* Neon version of S32_Blend_BlitRow32() | 874 /* Neon version of S32_Blend_BlitRow32() |
1151 * portable version is in src/core/SkBlitRow_D32.cpp | 875 * portable version is in src/core/SkBlitRow_D32.cpp |
1152 */ | 876 */ |
1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 877 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
1154 const SkPMColor* SK_RESTRICT src, | 878 const SkPMColor* SK_RESTRICT src, |
1155 int count, U8CPU alpha) { | 879 int count, U8CPU alpha) { |
1156 SkASSERT(alpha <= 255); | 880 SkASSERT(alpha <= 255); |
1157 | 881 |
1158 if (count <= 0) { | 882 if (count <= 0) { |
1159 return; | 883 return; |
(...skipping 394 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
/* NOTE(review): unchanged by this diff (both columns identical) -- a single
 * NEON proc, Color32A_D565_neon, serves all four 565 color-proc slots
 * (opaque/alpha, dithered or not), per the trailing slot comments. */
1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { | 1278 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { |
1555 Color32A_D565_neon, // Color32_D565, | 1279 Color32A_D565_neon, // Color32_D565, |
1556 Color32A_D565_neon, // Color32A_D565, | 1280 Color32A_D565_neon, // Color32A_D565, |
1557 Color32A_D565_neon, // Color32_D565_Dither, | 1281 Color32A_D565_neon, // Color32_D565_Dither, |
1558 Color32A_D565_neon, // Color32A_D565_Dither | 1282 Color32A_D565_neon, // Color32A_D565_Dither |
1559 }; | 1283 }; |
1560 | 1284 |
/* NOTE(review): the interesting change of this diff. The OLD column chose
 * between two S32A_Opaque NEON procs at compile time (the src-alpha
 * inspecting variant only when SK_A32_SHIFT == 24); the NEW column replaces
 * that slot with nullptr because the blit moved to SkOpts. The S32A_Blend
 * slot keeps its NEON proc on 32-bit ARM only (nullptr elsewhere). */
1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1562 nullptr, // S32_Opaque, | 1286 nullptr, // S32_Opaque, |
1563 S32_Blend_BlitRow32_neon, // S32_Blend, | 1287 S32_Blend_BlitRow32_neon, // S32_Blend, |
1564 /* | 1288 nullptr, // Ported to SkOpts |
1565 * We have two choices for S32A_Opaque procs. The one reads the src alpha | |
1566 * value and attempts to optimize accordingly. The optimization is | |
1567 * sensitive to the source content and is not a win in all cases. For | |
1568 * example, if there are a lot of transitions between the alpha states, | |
1569 * the performance will almost certainly be worse. However, for many | |
1570 * common cases the performance is equivalent or better than the standard | |
1571 * case where we do not inspect the src alpha. | |
1572 */ | |
1573 #if SK_A32_SHIFT == 24 | |
1574 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | |
1575 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | |
1576 #else | |
1577 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | |
1578 #endif | |
1579 #ifdef SK_CPU_ARM32 | 1289 #ifdef SK_CPU_ARM32 |
1580 S32A_Blend_BlitRow32_neon // S32A_Blend | 1290 S32A_Blend_BlitRow32_neon // S32A_Blend |
1581 #else | 1291 #else |
1582 nullptr | 1292 nullptr |
1583 #endif | 1293 #endif |
1584 }; | 1294 }; |
OLD | NEW |