Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(176)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its… (Closed) Base URL: https://skia.googlesource.com/skia@master
Patch Set: Fix overflow in destination scale calculation Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 891 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 902 vdst = vreinterpret_u8_u32(vld1_u32(dst));
903 903
904 // Process src 904 // Process src
905 vsrc_wide = vmovl_u8(vsrc); 905 vsrc_wide = vmovl_u8(vsrc);
906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
907 907
908 // Process dst 908 // Process dst
909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
910 910
911 // Combine 911 // Combine
912 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 912 vdst_wide += vsrc_wide;
913 vres = vshrn_n_u16(vdst_wide, 8);
913 914
914 // Store 915 // Store
915 vst1_u32(dst, vreinterpret_u32_u8(vres)); 916 vst1_u32(dst, vreinterpret_u32_u8(vres));
916 917
917 src += 2; 918 src += 2;
918 dst += 2; 919 dst += 2;
919 count -= 2; 920 count -= 2;
920 } 921 }
921 922
922 if (count == 1) { 923 if (count == 1) {
923 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 924 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
924 uint16x8_t vsrc_wide, vdst_wide; 925 uint16x8_t vsrc_wide, vdst_wide;
925 926
926 // Load 927 // Load
927 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 928 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
928 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 929 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
929 930
930 // Process 931 // Process
931 vsrc_wide = vmovl_u8(vsrc); 932 vsrc_wide = vmovl_u8(vsrc);
932 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 933 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
933 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 934 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
934 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 935 vdst_wide += vsrc_wide;
936 vres = vshrn_n_u16(vdst_wide, 8);
935 937
936 // Store 938 // Store
937 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 939 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
938 } 940 }
939 } 941 }
940 942
941 #ifdef SK_CPU_ARM32 943 #ifdef SK_CPU_ARM32
942 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 944 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
943 const SkPMColor* SK_RESTRICT src, 945 const SkPMColor* SK_RESTRICT src,
944 int count, U8CPU alpha) { 946 int count, U8CPU alpha) {
(...skipping 12 matching lines...) Expand all
957 uint16x8_t vdst_wide, vsrc_wide; 959 uint16x8_t vdst_wide, vsrc_wide;
958 unsigned dst_scale; 960 unsigned dst_scale;
959 961
960 // Load 962 // Load
961 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 963 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
962 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 964 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
963 965
964 // Calc dst_scale 966 // Calc dst_scale
965 dst_scale = vget_lane_u8(vsrc, 3); 967 dst_scale = vget_lane_u8(vsrc, 3);
966 dst_scale *= alpha256; 968 dst_scale *= alpha256;
969 dst_scale = (256<<8) - dst_scale;
967 dst_scale >>= 8; 970 dst_scale >>= 8;
968 dst_scale = 256 - dst_scale;
969 971
970 // Process src 972 // Process src
971 vsrc_wide = vmovl_u8(vsrc); 973 vsrc_wide = vmovl_u8(vsrc);
972 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 974 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
973 975
974 // Process dst 976 // Process dst
975 vdst_wide = vmovl_u8(vdst); 977 vdst_wide = vmovl_u8(vdst);
976 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 978 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
977 979
978 // Combine 980 // Combine
979 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 981 vdst_wide += vsrc_wide;
982 vres = vshrn_n_u16(vdst_wide, 8);
980 983
981 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 984 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
982 dst++; 985 dst++;
983 src++; 986 src++;
984 count--; 987 count--;
985 } 988 }
986 989
987 if (count) { 990 if (count) {
988 uint8x8_t alpha_mask; 991 uint8x8_t alpha_mask;
989 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 992 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
(...skipping 10 matching lines...) Expand all
1000 // Load 1003 // Load
1001 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 1004 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1002 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 1005 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1003 1006
1004 // Prepare src_scale 1007 // Prepare src_scale
1005 vsrc_scale = vdupq_n_u16(alpha256); 1008 vsrc_scale = vdupq_n_u16(alpha256);
1006 1009
1007 // Calc dst_scale 1010 // Calc dst_scale
1008 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 1011 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1009 vdst_scale = vmovl_u8(vsrc_alphas); 1012 vdst_scale = vmovl_u8(vsrc_alphas);
1010 vdst_scale *= vsrc_scale; 1013 vdst_scale = vmlsq_u16(vdupq_n_u16(255<<8), vdst_scale, vsrc_scale);
1011 vdst_scale = vshrq_n_u16(vdst_scale, 8); 1014 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
1012 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1013 1015
1014 // Process src 1016 // Process src
1015 vsrc_wide = vmovl_u8(vsrc); 1017 vsrc_wide = vmovl_u8(vsrc);
1016 vsrc_wide *= vsrc_scale; 1018 vsrc_wide *= vsrc_scale;
1017 1019
1018 // Process dst 1020 // Process dst
1019 vdst_wide = vmovl_u8(vdst); 1021 vdst_wide = vmovl_u8(vdst);
1020 vdst_wide *= vdst_scale; 1022 vdst_wide *= vdst_scale;
1021 1023
1022 // Combine 1024 // Combine
1023 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 1025 vdst_wide += vsrc_wide;
1026 vres = vshrn_n_u16(vdst_wide, 8);
1024 1027
1025 vst1_u32(dst, vreinterpret_u32_u8(vres)); 1028 vst1_u32(dst, vreinterpret_u32_u8(vres));
1026 1029
1027 src += 2; 1030 src += 2;
1028 dst += 2; 1031 dst += 2;
1029 count -= 2; 1032 count -= 2;
1030 } while(count); 1033 } while(count);
1031 } 1034 }
1032 } 1035 }
1033 1036
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after
1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1288 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1286 nullptr, // S32_Opaque, 1289 nullptr, // S32_Opaque,
1287 S32_Blend_BlitRow32_neon, // S32_Blend, 1290 S32_Blend_BlitRow32_neon, // S32_Blend,
1288 nullptr, // Ported to SkOpts 1291 nullptr, // Ported to SkOpts
1289 #ifdef SK_CPU_ARM32 1292 #ifdef SK_CPU_ARM32
1290 S32A_Blend_BlitRow32_neon // S32A_Blend 1293 S32A_Blend_BlitRow32_neon // S32A_Blend
1291 #else 1294 #else
1292 nullptr 1295 nullptr
1293 #endif 1296 #endif
1294 }; 1297 };
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698