OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 891 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
902 vdst = vreinterpret_u8_u32(vld1_u32(dst)); | 902 vdst = vreinterpret_u8_u32(vld1_u32(dst)); |
903 | 903 |
904 // Process src | 904 // Process src |
905 vsrc_wide = vmovl_u8(vsrc); | 905 vsrc_wide = vmovl_u8(vsrc); |
906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
907 | 907 |
908 // Process dst | 908 // Process dst |
909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
910 | 910 |
911 // Combine | 911 // Combine |
912 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 912 vdst_wide += vsrc_wide; |
| 913 vres = vshrn_n_u16(vdst_wide, 8); |
913 | 914 |
914 // Store | 915 // Store |
915 vst1_u32(dst, vreinterpret_u32_u8(vres)); | 916 vst1_u32(dst, vreinterpret_u32_u8(vres)); |
916 | 917 |
917 src += 2; | 918 src += 2; |
918 dst += 2; | 919 dst += 2; |
919 count -= 2; | 920 count -= 2; |
920 } | 921 } |
921 | 922 |
922 if (count == 1) { | 923 if (count == 1) { |
923 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; | 924 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; |
924 uint16x8_t vsrc_wide, vdst_wide; | 925 uint16x8_t vsrc_wide, vdst_wide; |
925 | 926 |
926 // Load | 927 // Load |
927 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); | 928 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); |
928 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); | 929 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); |
929 | 930 |
930 // Process | 931 // Process |
931 vsrc_wide = vmovl_u8(vsrc); | 932 vsrc_wide = vmovl_u8(vsrc); |
932 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 933 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
933 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 934 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
934 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 935 vdst_wide += vsrc_wide; |
| 936 vres = vshrn_n_u16(vdst_wide, 8); |
935 | 937 |
936 // Store | 938 // Store |
937 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 939 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
938 } | 940 } |
939 } | 941 } |
940 | 942 |
941 #ifdef SK_CPU_ARM32 | 943 #ifdef SK_CPU_ARM32 |
942 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 944 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
943 const SkPMColor* SK_RESTRICT src, | 945 const SkPMColor* SK_RESTRICT src, |
944 int count, U8CPU alpha) { | 946 int count, U8CPU alpha) { |
(...skipping 12 matching lines...) Expand all Loading... |
957 uint16x8_t vdst_wide, vsrc_wide; | 959 uint16x8_t vdst_wide, vsrc_wide; |
958 unsigned dst_scale; | 960 unsigned dst_scale; |
959 | 961 |
960 // Load | 962 // Load |
961 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); | 963 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); |
962 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); | 964 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); |
963 | 965 |
964 // Calc dst_scale | 966 // Calc dst_scale |
965 dst_scale = vget_lane_u8(vsrc, 3); | 967 dst_scale = vget_lane_u8(vsrc, 3); |
966 dst_scale *= alpha256; | 968 dst_scale *= alpha256; |
| 969 dst_scale = (256<<8) - dst_scale; |
967 dst_scale >>= 8; | 970 dst_scale >>= 8; |
968 dst_scale = 256 - dst_scale; | |
969 | 971 |
970 // Process src | 972 // Process src |
971 vsrc_wide = vmovl_u8(vsrc); | 973 vsrc_wide = vmovl_u8(vsrc); |
972 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); | 974 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); |
973 | 975 |
974 // Process dst | 976 // Process dst |
975 vdst_wide = vmovl_u8(vdst); | 977 vdst_wide = vmovl_u8(vdst); |
976 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); | 978 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); |
977 | 979 |
978 // Combine | 980 // Combine |
979 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 981 vdst_wide += vsrc_wide; |
| 982 vres = vshrn_n_u16(vdst_wide, 8); |
980 | 983 |
981 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 984 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
982 dst++; | 985 dst++; |
983 src++; | 986 src++; |
984 count--; | 987 count--; |
985 } | 988 } |
986 | 989 |
987 if (count) { | 990 if (count) { |
988 uint8x8_t alpha_mask; | 991 uint8x8_t alpha_mask; |
989 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 992 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
(...skipping 10 matching lines...) Expand all Loading... |
1000 // Load | 1003 // Load |
1001 vsrc = vreinterpret_u8_u32(vld1_u32(src)); | 1004 vsrc = vreinterpret_u8_u32(vld1_u32(src)); |
1002 vdst = vreinterpret_u8_u32(vld1_u32(dst)); | 1005 vdst = vreinterpret_u8_u32(vld1_u32(dst)); |
1003 | 1006 |
1004 // Prepare src_scale | 1007 // Prepare src_scale |
1005 vsrc_scale = vdupq_n_u16(alpha256); | 1008 vsrc_scale = vdupq_n_u16(alpha256); |
1006 | 1009 |
1007 // Calc dst_scale | 1010 // Calc dst_scale |
1008 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); | 1011 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); |
1009 vdst_scale = vmovl_u8(vsrc_alphas); | 1012 vdst_scale = vmovl_u8(vsrc_alphas); |
1010 vdst_scale *= vsrc_scale; | 1013 vdst_scale = vmlsq_u16(vdupq_n_u16(255<<8), vdst_scale, vsrc_scale); |
1011 vdst_scale = vshrq_n_u16(vdst_scale, 8); | 1014 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8); |
1012 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); | |
1013 | 1015 |
1014 // Process src | 1016 // Process src |
1015 vsrc_wide = vmovl_u8(vsrc); | 1017 vsrc_wide = vmovl_u8(vsrc); |
1016 vsrc_wide *= vsrc_scale; | 1018 vsrc_wide *= vsrc_scale; |
1017 | 1019 |
1018 // Process dst | 1020 // Process dst |
1019 vdst_wide = vmovl_u8(vdst); | 1021 vdst_wide = vmovl_u8(vdst); |
1020 vdst_wide *= vdst_scale; | 1022 vdst_wide *= vdst_scale; |
1021 | 1023 |
1022 // Combine | 1024 // Combine |
1023 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 1025 vdst_wide += vsrc_wide; |
| 1026 vres = vshrn_n_u16(vdst_wide, 8); |
1024 | 1027 |
1025 vst1_u32(dst, vreinterpret_u32_u8(vres)); | 1028 vst1_u32(dst, vreinterpret_u32_u8(vres)); |
1026 | 1029 |
1027 src += 2; | 1030 src += 2; |
1028 dst += 2; | 1031 dst += 2; |
1029 count -= 2; | 1032 count -= 2; |
1030 } while(count); | 1033 } while(count); |
1031 } | 1034 } |
1032 } | 1035 } |
1033 | 1036 |
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1288 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1286 nullptr, // S32_Opaque, | 1289 nullptr, // S32_Opaque, |
1287 S32_Blend_BlitRow32_neon, // S32_Blend, | 1290 S32_Blend_BlitRow32_neon, // S32_Blend, |
1288 nullptr, // Ported to SkOpts | 1291 nullptr, // Ported to SkOpts |
1289 #ifdef SK_CPU_ARM32 | 1292 #ifdef SK_CPU_ARM32 |
1290 S32A_Blend_BlitRow32_neon // S32A_Blend | 1293 S32A_Blend_BlitRow32_neon // S32A_Blend |
1291 #else | 1294 #else |
1292 nullptr | 1295 nullptr |
1293 #endif | 1296 #endif |
1294 }; | 1297 }; |
OLD | NEW |