OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 891 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
902 vdst = vreinterpret_u8_u32(vld1_u32(dst)); | 902 vdst = vreinterpret_u8_u32(vld1_u32(dst)); |
903 | 903 |
904 // Process src | 904 // Process src |
905 vsrc_wide = vmovl_u8(vsrc); | 905 vsrc_wide = vmovl_u8(vsrc); |
906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
907 | 907 |
908 // Process dst | 908 // Process dst |
909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
910 | 910 |
911 // Combine | 911 // Combine |
| 912 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
912 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 913 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 914 #else |
| 915 vdst_wide += vsrc_wide; |
| 916 vres = vshrn_n_u16(vdst_wide, 8); |
| 917 #endif |
913 | 918 |
914 // Store | 919 // Store |
915 vst1_u32(dst, vreinterpret_u32_u8(vres)); | 920 vst1_u32(dst, vreinterpret_u32_u8(vres)); |
916 | 921 |
917 src += 2; | 922 src += 2; |
918 dst += 2; | 923 dst += 2; |
919 count -= 2; | 924 count -= 2; |
920 } | 925 } |
921 | 926 |
922 if (count == 1) { | 927 if (count == 1) { |
923 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; | 928 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; |
924 uint16x8_t vsrc_wide, vdst_wide; | 929 uint16x8_t vsrc_wide, vdst_wide; |
925 | 930 |
926 // Load | 931 // Load |
927 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); | 932 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); |
928 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); | 933 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); |
929 | 934 |
930 // Process | 935 // Process |
931 vsrc_wide = vmovl_u8(vsrc); | 936 vsrc_wide = vmovl_u8(vsrc); |
932 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 937 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
933 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 938 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
| 939 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
934 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 940 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 941 #else |
| 942 vdst_wide += vsrc_wide; |
| 943 vres = vshrn_n_u16(vdst_wide, 8); |
| 944 #endif |
935 | 945 |
936 // Store | 946 // Store |
937 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 947 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
938 } | 948 } |
939 } | 949 } |
940 | 950 |
941 #ifdef SK_CPU_ARM32 | 951 #ifdef SK_CPU_ARM32 |
942 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 952 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
943 const SkPMColor* SK_RESTRICT src, | 953 const SkPMColor* SK_RESTRICT src, |
944 int count, U8CPU alpha) { | 954 int count, U8CPU alpha) { |
945 | 955 |
946 SkASSERT(255 >= alpha); | 956 SkASSERT(255 > alpha); |
947 | 957 |
948 if (count <= 0) { | 958 if (count <= 0) { |
949 return; | 959 return; |
950 } | 960 } |
951 | 961 |
952 unsigned alpha256 = SkAlpha255To256(alpha); | 962 unsigned alpha256 = SkAlpha255To256(alpha); |
953 | 963 |
954 // First deal with odd counts | 964 // First deal with odd counts |
955 if (count & 1) { | 965 if (count & 1) { |
956 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; | 966 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; |
957 uint16x8_t vdst_wide, vsrc_wide; | 967 uint16x8_t vdst_wide, vsrc_wide; |
958 unsigned dst_scale; | 968 unsigned dst_scale; |
959 | 969 |
960 // Load | 970 // Load |
961 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); | 971 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); |
962 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); | 972 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); |
963 | 973 |
964 // Calc dst_scale | 974 // Calc dst_scale |
965 dst_scale = vget_lane_u8(vsrc, 3); | 975 dst_scale = vget_lane_u8(vsrc, 3); |
966 dst_scale *= alpha256; | 976 dst_scale = SkAlphaMulInv256(dst_scale, alpha256); |
967 dst_scale >>= 8; | |
968 dst_scale = 256 - dst_scale; | |
969 | 977 |
970 // Process src | 978 // Process src |
971 vsrc_wide = vmovl_u8(vsrc); | 979 vsrc_wide = vmovl_u8(vsrc); |
972 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); | 980 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); |
973 | 981 |
974 // Process dst | 982 // Process dst |
975 vdst_wide = vmovl_u8(vdst); | 983 vdst_wide = vmovl_u8(vdst); |
976 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); | 984 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); |
977 | 985 |
978 // Combine | 986 // Combine |
| 987 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
979 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 988 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 989 #else |
| 990 vdst_wide += vsrc_wide; |
| 991 vres = vshrn_n_u16(vdst_wide, 8); |
| 992 #endif |
980 | 993 |
981 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 994 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
982 dst++; | 995 dst++; |
983 src++; | 996 src++; |
984 count--; | 997 count--; |
985 } | 998 } |
986 | 999 |
987 if (count) { | 1000 if (count) { |
988 uint8x8_t alpha_mask; | 1001 uint8x8_t alpha_mask; |
989 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 1002 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
(...skipping 10 matching lines...) Expand all Loading... |
1000 // Load | 1013 // Load |
1001 vsrc = vreinterpret_u8_u32(vld1_u32(src)); | 1014 vsrc = vreinterpret_u8_u32(vld1_u32(src)); |
1002 vdst = vreinterpret_u8_u32(vld1_u32(dst)); | 1015 vdst = vreinterpret_u8_u32(vld1_u32(dst)); |
1003 | 1016 |
1004 // Prepare src_scale | 1017 // Prepare src_scale |
1005 vsrc_scale = vdupq_n_u16(alpha256); | 1018 vsrc_scale = vdupq_n_u16(alpha256); |
1006 | 1019 |
1007 // Calc dst_scale | 1020 // Calc dst_scale |
1008 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); | 1021 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); |
1009 vdst_scale = vmovl_u8(vsrc_alphas); | 1022 vdst_scale = vmovl_u8(vsrc_alphas); |
| 1023 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
1010 vdst_scale *= vsrc_scale; | 1024 vdst_scale *= vsrc_scale; |
1011 vdst_scale = vshrq_n_u16(vdst_scale, 8); | 1025 vdst_scale = vshrq_n_u16(vdst_scale, 8); |
1012 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); | 1026 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); |
| 1027 #else |
| 1028 // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale). |
| 1029 // A 16-bit lane would overflow if we used 0xFFFF here, |
| 1030 // so use an approximation with 0xFF00 that is off by 1, |
| 1031 // and add back 1 after to get the correct value. |
| 1032 // This is valid if alpha256 <= 255. |
| 1033 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale); |
| 1034 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8); |
| 1035 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8); |
| 1036 #endif |
1013 | 1037 |
1014 // Process src | 1038 // Process src |
1015 vsrc_wide = vmovl_u8(vsrc); | 1039 vsrc_wide = vmovl_u8(vsrc); |
1016 vsrc_wide *= vsrc_scale; | 1040 vsrc_wide *= vsrc_scale; |
1017 | 1041 |
1018 // Process dst | 1042 // Process dst |
1019 vdst_wide = vmovl_u8(vdst); | 1043 vdst_wide = vmovl_u8(vdst); |
1020 vdst_wide *= vdst_scale; | 1044 vdst_wide *= vdst_scale; |
1021 | 1045 |
1022 // Combine | 1046 // Combine |
| 1047 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
1023 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 1048 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 1049 #else |
| 1050 vdst_wide += vsrc_wide; |
| 1051 vres = vshrn_n_u16(vdst_wide, 8); |
| 1052 #endif |
1024 | 1053 |
1025 vst1_u32(dst, vreinterpret_u32_u8(vres)); | 1054 vst1_u32(dst, vreinterpret_u32_u8(vres)); |
1026 | 1055 |
1027 src += 2; | 1056 src += 2; |
1028 dst += 2; | 1057 dst += 2; |
1029 count -= 2; | 1058 count -= 2; |
1030 } while(count); | 1059 } while(count); |
1031 } | 1060 } |
1032 } | 1061 } |
1033 | 1062 |
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1314 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1286 nullptr, // S32_Opaque, | 1315 nullptr, // S32_Opaque, |
1287 S32_Blend_BlitRow32_neon, // S32_Blend, | 1316 S32_Blend_BlitRow32_neon, // S32_Blend, |
1288 nullptr, // Ported to SkOpts | 1317 nullptr, // Ported to SkOpts |
1289 #ifdef SK_CPU_ARM32 | 1318 #ifdef SK_CPU_ARM32 |
1290 S32A_Blend_BlitRow32_neon // S32A_Blend | 1319 S32A_Blend_BlitRow32_neon // S32A_Blend |
1291 #else | 1320 #else |
1292 nullptr | 1321 nullptr |
1293 #endif | 1322 #endif |
1294 }; | 1323 }; |
OLD | NEW |