Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(148)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its… (Closed) Base URL: https://skia.googlesource.com/skia@master
Patch Set: guard more changes with SK_SUPPORT_LEGACY_BROKEN_LERP Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBlitRow_opts_SSE2.cpp ('k') | src/opts/SkBlitRow_opts_mips_dsp.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 891 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 902 vdst = vreinterpret_u8_u32(vld1_u32(dst));
903 903
904 // Process src 904 // Process src
905 vsrc_wide = vmovl_u8(vsrc); 905 vsrc_wide = vmovl_u8(vsrc);
906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
907 907
908 // Process dst 908 // Process dst
909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
910 910
911 // Combine 911 // Combine
912 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
912 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 913 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
914 #else
915 vdst_wide += vsrc_wide;
916 vres = vshrn_n_u16(vdst_wide, 8);
917 #endif
913 918
914 // Store 919 // Store
915 vst1_u32(dst, vreinterpret_u32_u8(vres)); 920 vst1_u32(dst, vreinterpret_u32_u8(vres));
916 921
917 src += 2; 922 src += 2;
918 dst += 2; 923 dst += 2;
919 count -= 2; 924 count -= 2;
920 } 925 }
921 926
922 if (count == 1) { 927 if (count == 1) {
923 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 928 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
924 uint16x8_t vsrc_wide, vdst_wide; 929 uint16x8_t vsrc_wide, vdst_wide;
925 930
926 // Load 931 // Load
927 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 932 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
928 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 933 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
929 934
930 // Process 935 // Process
931 vsrc_wide = vmovl_u8(vsrc); 936 vsrc_wide = vmovl_u8(vsrc);
932 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 937 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
933 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 938 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
939 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
934 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 940 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
941 #else
942 vdst_wide += vsrc_wide;
943 vres = vshrn_n_u16(vdst_wide, 8);
944 #endif
935 945
936 // Store 946 // Store
937 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 947 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
938 } 948 }
939 } 949 }
940 950
941 #ifdef SK_CPU_ARM32 951 #ifdef SK_CPU_ARM32
942 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 952 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
943 const SkPMColor* SK_RESTRICT src, 953 const SkPMColor* SK_RESTRICT src,
944 int count, U8CPU alpha) { 954 int count, U8CPU alpha) {
945 955
946 SkASSERT(255 >= alpha); 956 SkASSERT(255 > alpha);
947 957
948 if (count <= 0) { 958 if (count <= 0) {
949 return; 959 return;
950 } 960 }
951 961
952 unsigned alpha256 = SkAlpha255To256(alpha); 962 unsigned alpha256 = SkAlpha255To256(alpha);
953 963
954 // First deal with odd counts 964 // First deal with odd counts
955 if (count & 1) { 965 if (count & 1) {
956 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 966 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
957 uint16x8_t vdst_wide, vsrc_wide; 967 uint16x8_t vdst_wide, vsrc_wide;
958 unsigned dst_scale; 968 unsigned dst_scale;
959 969
960 // Load 970 // Load
961 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 971 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
962 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 972 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
963 973
964 // Calc dst_scale 974 // Calc dst_scale
965 dst_scale = vget_lane_u8(vsrc, 3); 975 dst_scale = vget_lane_u8(vsrc, 3);
966 dst_scale *= alpha256; 976 dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
967 dst_scale >>= 8;
968 dst_scale = 256 - dst_scale;
969 977
970 // Process src 978 // Process src
971 vsrc_wide = vmovl_u8(vsrc); 979 vsrc_wide = vmovl_u8(vsrc);
972 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 980 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
973 981
974 // Process dst 982 // Process dst
975 vdst_wide = vmovl_u8(vdst); 983 vdst_wide = vmovl_u8(vdst);
976 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 984 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
977 985
978 // Combine 986 // Combine
987 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
979 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 988 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
989 #else
990 vdst_wide += vsrc_wide;
991 vres = vshrn_n_u16(vdst_wide, 8);
992 #endif
980 993
981 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 994 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
982 dst++; 995 dst++;
983 src++; 996 src++;
984 count--; 997 count--;
985 } 998 }
986 999
987 if (count) { 1000 if (count) {
988 uint8x8_t alpha_mask; 1001 uint8x8_t alpha_mask;
989 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 1002 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
(...skipping 10 matching lines...) Expand all
1000 // Load 1013 // Load
1001 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 1014 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1002 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 1015 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1003 1016
1004 // Prepare src_scale 1017 // Prepare src_scale
1005 vsrc_scale = vdupq_n_u16(alpha256); 1018 vsrc_scale = vdupq_n_u16(alpha256);
1006 1019
1007 // Calc dst_scale 1020 // Calc dst_scale
1008 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 1021 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1009 vdst_scale = vmovl_u8(vsrc_alphas); 1022 vdst_scale = vmovl_u8(vsrc_alphas);
1023 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
1010 vdst_scale *= vsrc_scale; 1024 vdst_scale *= vsrc_scale;
1011 vdst_scale = vshrq_n_u16(vdst_scale, 8); 1025 vdst_scale = vshrq_n_u16(vdst_scale, 8);
1012 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); 1026 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1027 #else
1028 // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
1029 // A 16-bit lane would overflow if we used 0xFFFF here,
1030 // so use an approximation with 0xFF00 that is off by 1,
1031 // and add back 1 after to get the correct value.
1032 // This is valid if alpha256 <= 255.
1033 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
1034 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
1035 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
1036 #endif
1013 1037
1014 // Process src 1038 // Process src
1015 vsrc_wide = vmovl_u8(vsrc); 1039 vsrc_wide = vmovl_u8(vsrc);
1016 vsrc_wide *= vsrc_scale; 1040 vsrc_wide *= vsrc_scale;
1017 1041
1018 // Process dst 1042 // Process dst
1019 vdst_wide = vmovl_u8(vdst); 1043 vdst_wide = vmovl_u8(vdst);
1020 vdst_wide *= vdst_scale; 1044 vdst_wide *= vdst_scale;
1021 1045
1022 // Combine 1046 // Combine
1047 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
1023 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 1048 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1049 #else
1050 vdst_wide += vsrc_wide;
1051 vres = vshrn_n_u16(vdst_wide, 8);
1052 #endif
1024 1053
1025 vst1_u32(dst, vreinterpret_u32_u8(vres)); 1054 vst1_u32(dst, vreinterpret_u32_u8(vres));
1026 1055
1027 src += 2; 1056 src += 2;
1028 dst += 2; 1057 dst += 2;
1029 count -= 2; 1058 count -= 2;
1030 } while(count); 1059 } while(count);
1031 } 1060 }
1032 } 1061 }
1033 1062
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after
1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1314 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1286 nullptr, // S32_Opaque, 1315 nullptr, // S32_Opaque,
1287 S32_Blend_BlitRow32_neon, // S32_Blend, 1316 S32_Blend_BlitRow32_neon, // S32_Blend,
1288 nullptr, // Ported to SkOpts 1317 nullptr, // Ported to SkOpts
1289 #ifdef SK_CPU_ARM32 1318 #ifdef SK_CPU_ARM32
1290 S32A_Blend_BlitRow32_neon // S32A_Blend 1319 S32A_Blend_BlitRow32_neon // S32A_Blend
1291 #else 1320 #else
1292 nullptr 1321 nullptr
1293 #endif 1322 #endif
1294 }; 1323 };
OLDNEW
« no previous file with comments | « src/opts/SkBlitRow_opts_SSE2.cpp ('k') | src/opts/SkBlitRow_opts_mips_dsp.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698