| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 853 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 864 | 864 |
| 865 uint16_t d = *dst; | 865 uint16_t d = *dst; |
| 866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 866 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
| 867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 867 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
| 868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 868 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
| 869 DITHER_INC_X(x); | 869 DITHER_INC_X(x); |
| 870 } while (--count != 0); | 870 } while (--count != 0); |
| 871 } | 871 } |
| 872 } | 872 } |
| 873 | 873 |
| 874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | |
| 875 const SkPMColor* SK_RESTRICT src, | |
| 876 int count, U8CPU alpha) { | |
| 877 | |
| 878 SkASSERT(255 == alpha); | |
| 879 if (count > 0) { | |
| 880 | |
| 881 | |
| 882 uint8x8_t alpha_mask; | |
| 883 | |
| 884 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | |
| 885 alpha_mask = vld1_u8(alpha_mask_setup); | |
| 886 | |
| 887 /* do the NEON unrolled code */ | |
| 888 #define UNROLL 4 | |
| 889 while (count >= UNROLL) { | |
| 890 uint8x8_t src_raw, dst_raw, dst_final; | |
| 891 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | |
| 892 | |
| 893 /* The two prefetches below may make the code slightly | |
| 894 * slower for small values of count but are worth having | |
| 895 * in the general case. | |
| 896 */ | |
| 897 __builtin_prefetch(src+32); | |
| 898 __builtin_prefetch(dst+32); | |
| 899 | |
| 900 /* get the source */ | |
| 901 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | |
| 902 #if UNROLL > 2 | |
| 903 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | |
| 904 #endif | |
| 905 | |
| 906 /* get and hold the dst too */ | |
| 907 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | |
| 908 #if UNROLL > 2 | |
| 909 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | |
| 910 #endif | |
| 911 | |
| 912 /* 1st and 2nd bits of the unrolling */ | |
| 913 { | |
| 914 uint8x8_t dst_cooked; | |
| 915 uint16x8_t dst_wide; | |
| 916 uint8x8_t alpha_narrow; | |
| 917 uint16x8_t alpha_wide; | |
| 918 | |
| 919 /* get the alphas spread out properly */ | |
| 920 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | |
| 921 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 922 | |
| 923 /* spread the dest */ | |
| 924 dst_wide = vmovl_u8(dst_raw); | |
| 925 | |
| 926 /* alpha mul the dest */ | |
| 927 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 928 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 929 | |
| 930 /* sum -- ignoring any byte lane overflows */ | |
| 931 dst_final = vadd_u8(src_raw, dst_cooked); | |
| 932 } | |
| 933 | |
| 934 #if UNROLL > 2 | |
| 935 /* the 3rd and 4th bits of our unrolling */ | |
| 936 { | |
| 937 uint8x8_t dst_cooked; | |
| 938 uint16x8_t dst_wide; | |
| 939 uint8x8_t alpha_narrow; | |
| 940 uint16x8_t alpha_wide; | |
| 941 | |
| 942 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | |
| 943 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 944 | |
| 945 /* spread the dest */ | |
| 946 dst_wide = vmovl_u8(dst_raw_2); | |
| 947 | |
| 948 /* alpha mul the dest */ | |
| 949 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 950 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 951 | |
| 952 /* sum -- ignoring any byte lane overflows */ | |
| 953 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | |
| 954 } | |
| 955 #endif | |
| 956 | |
| 957 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | |
| 958 #if UNROLL > 2 | |
| 959 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); | |
| 960 #endif | |
| 961 | |
| 962 src += UNROLL; | |
| 963 dst += UNROLL; | |
| 964 count -= UNROLL; | |
| 965 } | |
| 966 #undef UNROLL | |
| 967 | |
| 968 /* do any residual iterations */ | |
| 969 while (--count >= 0) { | |
| 970 *dst = SkPMSrcOver(*src, *dst); | |
| 971 src += 1; | |
| 972 dst += 1; | |
| 973 } | |
| 974 } | |
| 975 } | |
| 976 | |
| 977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, | |
| 978 const SkPMColor* SK_RESTRICT src, | |
| 979 int count, U8CPU alpha) { | |
| 980 SkASSERT(255 == alpha); | |
| 981 | |
| 982 if (count <= 0) | |
| 983 return; | |
| 984 | |
| 985 /* Use these to check if src is transparent or opaque */ | |
| 986 const unsigned int ALPHA_OPAQ = 0xFF000000; | |
| 987 const unsigned int ALPHA_TRANS = 0x00FFFFFF; | |
| 988 | |
| 989 #define UNROLL 4 | |
| 990 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); | |
| 991 const SkPMColor* SK_RESTRICT src_temp = src; | |
| 992 | |
| 993 /* set up the NEON variables */ | |
| 994 uint8x8_t alpha_mask; | |
| 995 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | |
| 996 alpha_mask = vld1_u8(alpha_mask_setup); | |
| 997 | |
| 998 uint8x8_t src_raw, dst_raw, dst_final; | |
| 999 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | |
| 1000 uint8x8_t dst_cooked; | |
| 1001 uint16x8_t dst_wide; | |
| 1002 uint8x8_t alpha_narrow; | |
| 1003 uint16x8_t alpha_wide; | |
| 1004 | |
| 1005 /* choose the first processing type */ | |
| 1006 if( src >= src_end) | |
| 1007 goto TAIL; | |
| 1008 if(*src <= ALPHA_TRANS) | |
| 1009 goto ALPHA_0; | |
| 1010 if(*src >= ALPHA_OPAQ) | |
| 1011 goto ALPHA_255; | |
| 1012 /* fall-thru */ | |
| 1013 | |
| 1014 ALPHA_1_TO_254: | |
| 1015 do { | |
| 1016 | |
| 1017 /* get the source */ | |
| 1018 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | |
| 1019 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | |
| 1020 | |
| 1021 /* get and hold the dst too */ | |
| 1022 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | |
| 1023 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | |
| 1024 | |
| 1025 | |
| 1026 /* get the alphas spread out properly */ | |
| 1027 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | |
| 1028 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
| 1029 /* we collapsed (255-a)+1 ... */ | |
| 1030 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 1031 | |
| 1032 /* spread the dest */ | |
| 1033 dst_wide = vmovl_u8(dst_raw); | |
| 1034 | |
| 1035 /* alpha mul the dest */ | |
| 1036 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 1037 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 1038 | |
| 1039 /* sum -- ignoring any byte lane overflows */ | |
| 1040 dst_final = vadd_u8(src_raw, dst_cooked); | |
| 1041 | |
| 1042 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | |
| 1043 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
| 1044 /* we collapsed (255-a)+1 ... */ | |
| 1045 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | |
| 1046 | |
| 1047 /* spread the dest */ | |
| 1048 dst_wide = vmovl_u8(dst_raw_2); | |
| 1049 | |
| 1050 /* alpha mul the dest */ | |
| 1051 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | |
| 1052 dst_cooked = vshrn_n_u16(dst_wide, 8); | |
| 1053 | |
| 1054 /* sum -- ignoring any byte lane overflows */ | |
| 1055 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | |
| 1056 | |
| 1057 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | |
| 1058 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); | |
| 1059 | |
| 1060 src += UNROLL; | |
| 1061 dst += UNROLL; | |
| 1062 | |
| 1063 /* if 2 of the next pixels aren't between 1 and 254 | |
| 1064 it might make sense to go to the optimized loops */ | |
| 1065 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) | |
| 1066 break; | |
| 1067 | |
| 1068 } while(src < src_end); | |
| 1069 | |
| 1070 if (src >= src_end) | |
| 1071 goto TAIL; | |
| 1072 | |
| 1073 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) | |
| 1074 goto ALPHA_255; | |
| 1075 | |
| 1076 /*fall-thru*/ | |
| 1077 | |
| 1078 ALPHA_0: | |
| 1079 | |
| 1080 /*In this state, we know the current alpha is 0 and | |
| 1081 we optimize for the next alpha also being zero. */ | |
| 1082 src_temp = src; //so we don't have to increment dst every time | |
| 1083 do { | |
| 1084 if(*(++src) > ALPHA_TRANS) | |
| 1085 break; | |
| 1086 if(*(++src) > ALPHA_TRANS) | |
| 1087 break; | |
| 1088 if(*(++src) > ALPHA_TRANS) | |
| 1089 break; | |
| 1090 if(*(++src) > ALPHA_TRANS) | |
| 1091 break; | |
| 1092 } while(src < src_end); | |
| 1093 | |
| 1094 dst += (src - src_temp); | |
| 1095 | |
| 1096 /* no longer alpha 0, so determine where to go next. */ | |
| 1097 if( src >= src_end) | |
| 1098 goto TAIL; | |
| 1099 if(*src >= ALPHA_OPAQ) | |
| 1100 goto ALPHA_255; | |
| 1101 else | |
| 1102 goto ALPHA_1_TO_254; | |
| 1103 | |
| 1104 ALPHA_255: | |
| 1105 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { | |
| 1106 dst[0]=src[0]; | |
| 1107 dst[1]=src[1]; | |
| 1108 dst[2]=src[2]; | |
| 1109 dst[3]=src[3]; | |
| 1110 src+=UNROLL; | |
| 1111 dst+=UNROLL; | |
| 1112 if(src >= src_end) | |
| 1113 goto TAIL; | |
| 1114 } | |
| 1115 | |
| 1116 //Handle remainder. | |
| 1117 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; | |
| 1118 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; | |
| 1119 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } | |
| 1120 } | |
| 1121 } | |
| 1122 | |
| 1123 if( src >= src_end) | |
| 1124 goto TAIL; | |
| 1125 if(*src <= ALPHA_TRANS) | |
| 1126 goto ALPHA_0; | |
| 1127 else | |
| 1128 goto ALPHA_1_TO_254; | |
| 1129 | |
| 1130 TAIL: | |
| 1131 /* do any residual iterations */ | |
| 1132 src_end += UNROLL + 1; //goto the real end | |
| 1133 while(src != src_end) { | |
| 1134 if( *src != 0 ) { | |
| 1135 if( *src >= ALPHA_OPAQ ) { | |
| 1136 *dst = *src; | |
| 1137 } | |
| 1138 else { | |
| 1139 *dst = SkPMSrcOver(*src, *dst); | |
| 1140 } | |
| 1141 } | |
| 1142 src++; | |
| 1143 dst++; | |
| 1144 } | |
| 1145 | |
| 1146 #undef UNROLL | |
| 1147 return; | |
| 1148 } | |
| 1149 | |
| 1150 /* Neon version of S32_Blend_BlitRow32() | 874 /* Neon version of S32_Blend_BlitRow32() |
| 1151 * portable version is in src/core/SkBlitRow_D32.cpp | 875 * portable version is in src/core/SkBlitRow_D32.cpp |
| 1152 */ | 876 */ |
| 1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 877 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 1154 const SkPMColor* SK_RESTRICT src, | 878 const SkPMColor* SK_RESTRICT src, |
| 1155 int count, U8CPU alpha) { | 879 int count, U8CPU alpha) { |
| 1156 SkASSERT(alpha <= 255); | 880 SkASSERT(alpha <= 255); |
| 1157 | 881 |
| 1158 if (count <= 0) { | 882 if (count <= 0) { |
| 1159 return; | 883 return; |
| (...skipping 394 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { | 1278 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { |
| 1555 Color32A_D565_neon, // Color32_D565, | 1279 Color32A_D565_neon, // Color32_D565, |
| 1556 Color32A_D565_neon, // Color32A_D565, | 1280 Color32A_D565_neon, // Color32A_D565, |
| 1557 Color32A_D565_neon, // Color32_D565_Dither, | 1281 Color32A_D565_neon, // Color32_D565_Dither, |
| 1558 Color32A_D565_neon, // Color32A_D565_Dither | 1282 Color32A_D565_neon, // Color32A_D565_Dither |
| 1559 }; | 1283 }; |
| 1560 | 1284 |
| 1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
| 1562 nullptr, // S32_Opaque, | 1286 nullptr, // S32_Opaque, |
| 1563 S32_Blend_BlitRow32_neon, // S32_Blend, | 1287 S32_Blend_BlitRow32_neon, // S32_Blend, |
| 1564 /* | 1288 nullptr, // Ported to SkOpts |
| 1565 * We have two choices for S32A_Opaque procs. The one reads the src alpha | |
| 1566 * value and attempts to optimize accordingly. The optimization is | |
| 1567 * sensitive to the source content and is not a win in all cases. For | |
| 1568 * example, if there are a lot of transitions between the alpha states, | |
| 1569 * the performance will almost certainly be worse. However, for many | |
| 1570 * common cases the performance is equivalent or better than the standard | |
| 1571 * case where we do not inspect the src alpha. | |
| 1572 */ | |
| 1573 #if SK_A32_SHIFT == 24 | |
| 1574 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | |
| 1575 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | |
| 1576 #else | |
| 1577 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | |
| 1578 #endif | |
| 1579 #ifdef SK_CPU_ARM32 | 1289 #ifdef SK_CPU_ARM32 |
| 1580 S32A_Blend_BlitRow32_neon // S32A_Blend | 1290 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1581 #else | 1291 #else |
| 1582 nullptr | 1292 nullptr |
| 1583 #endif | 1293 #endif |
| 1584 }; | 1294 }; |
| OLD | NEW |