src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its…

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its… (Closed) Base URL: https://skia.googlesource.com/skia@master

Patch Set: guard more changes with SK_SUPPORT_LEGACY_BROKEN_LERP Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 891 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
902 vdst = vreinterpret_u8_u32(vld1_u32(dst));	902 vdst = vreinterpret_u8_u32(vld1_u32(dst));

903	903

904 // Process src	904 // Process src

905 vsrc_wide = vmovl_u8(vsrc);	905 vsrc_wide = vmovl_u8(vsrc);

906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));	906 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

907	907

908 // Process dst	908 // Process dst

909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));	909 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

910	910

911 // Combine	911 // Combine

	912 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

912 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);	913 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	914 #else

	915 vdst_wide += vsrc_wide;

	916 vres = vshrn_n_u16(vdst_wide, 8);

	917 #endif

913	918

914 // Store	919 // Store

915 vst1_u32(dst, vreinterpret_u32_u8(vres));	920 vst1_u32(dst, vreinterpret_u32_u8(vres));

916	921

917 src += 2;	922 src += 2;

918 dst += 2;	923 dst += 2;

919 count -= 2;	924 count -= 2;

920 }	925 }

921	926

922 if (count == 1) {	927 if (count == 1) {

923 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;	928 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;

924 uint16x8_t vsrc_wide, vdst_wide;	929 uint16x8_t vsrc_wide, vdst_wide;

925	930

926 // Load	931 // Load

927 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));	932 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));

928 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));	933 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

929	934

930 // Process	935 // Process

931 vsrc_wide = vmovl_u8(vsrc);	936 vsrc_wide = vmovl_u8(vsrc);

932 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));	937 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

933 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));	938 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

	939 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

934 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);	940 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	941 #else

	942 vdst_wide += vsrc_wide;

	943 vres = vshrn_n_u16(vdst_wide, 8);

	944 #endif

935	945

936 // Store	946 // Store

937 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);	947 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);

938 }	948 }

939 }	949 }

940	950

941 #ifdef SK_CPU_ARM32	951 #ifdef SK_CPU_ARM32

942 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	952 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

943 const SkPMColor* SK_RESTRICT src,	953 const SkPMColor* SK_RESTRICT src,

944 int count, U8CPU alpha) {	954 int count, U8CPU alpha) {

945	955

946 SkASSERT(255 >= alpha);	956 SkASSERT(255 > alpha);

947	957

948 if (count <= 0) {	958 if (count <= 0) {

949 return;	959 return;

950 }	960 }

951	961

952 unsigned alpha256 = SkAlpha255To256(alpha);	962 unsigned alpha256 = SkAlpha255To256(alpha);

953	963

954 // First deal with odd counts	964 // First deal with odd counts

955 if (count & 1) {	965 if (count & 1) {

956 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;	966 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;

957 uint16x8_t vdst_wide, vsrc_wide;	967 uint16x8_t vdst_wide, vsrc_wide;

958 unsigned dst_scale;	968 unsigned dst_scale;

959	969

960 // Load	970 // Load

961 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));	971 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));

962 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));	972 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

963	973

964 // Calc dst_scale	974 // Calc dst_scale

965 dst_scale = vget_lane_u8(vsrc, 3);	975 dst_scale = vget_lane_u8(vsrc, 3);

966 dst_scale *= alpha256;	976 dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

967 dst_scale >>= 8;

968 dst_scale = 256 - dst_scale;

969	977

970 // Process src	978 // Process src

971 vsrc_wide = vmovl_u8(vsrc);	979 vsrc_wide = vmovl_u8(vsrc);

972 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);	980 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

973	981

974 // Process dst	982 // Process dst

975 vdst_wide = vmovl_u8(vdst);	983 vdst_wide = vmovl_u8(vdst);

976 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);	984 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

977	985

978 // Combine	986 // Combine

	987 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

979 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);	988 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	989 #else

	990 vdst_wide += vsrc_wide;

	991 vres = vshrn_n_u16(vdst_wide, 8);

	992 #endif

980	993

981 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);	994 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);

982 dst++;	995 dst++;

983 src++;	996 src++;

984 count--;	997 count--;

985 }	998 }

986	999

987 if (count) {	1000 if (count) {

988 uint8x8_t alpha_mask;	1001 uint8x8_t alpha_mask;

989 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};	1002 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};

(...skipping 10 matching lines...) Expand all Loading...
1000 // Load	1013 // Load

1001 vsrc = vreinterpret_u8_u32(vld1_u32(src));	1014 vsrc = vreinterpret_u8_u32(vld1_u32(src));

1002 vdst = vreinterpret_u8_u32(vld1_u32(dst));	1015 vdst = vreinterpret_u8_u32(vld1_u32(dst));

1003	1016

1004 // Prepare src_scale	1017 // Prepare src_scale

1005 vsrc_scale = vdupq_n_u16(alpha256);	1018 vsrc_scale = vdupq_n_u16(alpha256);

1006	1019

1007 // Calc dst_scale	1020 // Calc dst_scale

1008 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);	1021 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);

1009 vdst_scale = vmovl_u8(vsrc_alphas);	1022 vdst_scale = vmovl_u8(vsrc_alphas);

	1023 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

1010 vdst_scale *= vsrc_scale;	1024 vdst_scale *= vsrc_scale;

1011 vdst_scale = vshrq_n_u16(vdst_scale, 8);	1025 vdst_scale = vshrq_n_u16(vdst_scale, 8);

1012 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);	1026 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

	1027 #else

	1028 // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).

	1029 // A 16-bit lane would overflow if we used 0xFFFF here,

	1030 // so use an approximation with 0xFF00 that is off by 1,

	1031 // and add back 1 after to get the correct value.

	1032 // This is valid if alpha256 <= 255.

	1033 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);

	1034 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);

	1035 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

	1036 #endif

1013	1037

1014 // Process src	1038 // Process src

1015 vsrc_wide = vmovl_u8(vsrc);	1039 vsrc_wide = vmovl_u8(vsrc);

1016 vsrc_wide *= vsrc_scale;	1040 vsrc_wide *= vsrc_scale;

1017	1041

1018 // Process dst	1042 // Process dst

1019 vdst_wide = vmovl_u8(vdst);	1043 vdst_wide = vmovl_u8(vdst);

1020 vdst_wide *= vdst_scale;	1044 vdst_wide *= vdst_scale;

1021	1045

1022 // Combine	1046 // Combine

	1047 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

1023 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);	1048 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	1049 #else

	1050 vdst_wide += vsrc_wide;

	1051 vres = vshrn_n_u16(vdst_wide, 8);

	1052 #endif

1024	1053

1025 vst1_u32(dst, vreinterpret_u32_u8(vres));	1054 vst1_u32(dst, vreinterpret_u32_u8(vres));

1026	1055

1027 src += 2;	1056 src += 2;

1028 dst += 2;	1057 dst += 2;

1029 count -= 2;	1058 count -= 2;

1030 } while(count);	1059 } while(count);

1031 }	1060 }

1032 }	1061 }

1033	1062

(...skipping 251 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {	1314 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {

1286 nullptr, // S32_Opaque,	1315 nullptr, // S32_Opaque,

1287 S32_Blend_BlitRow32_neon, // S32_Blend,	1316 S32_Blend_BlitRow32_neon, // S32_Blend,

1288 nullptr, // Ported to SkOpts	1317 nullptr, // Ported to SkOpts

1289 #ifdef SK_CPU_ARM32	1318 #ifdef SK_CPU_ARM32

1290 S32A_Blend_BlitRow32_neon // S32A_Blend	1319 S32A_Blend_BlitRow32_neon // S32A_Blend

1291 #else	1320 #else

1292 nullptr	1321 nullptr

1293 #endif	1322 #endif

1294 };	1323 };

OLD	NEW

« no previous file with comments | « src/opts/SkBlitRow_opts_SSE2.cpp ('k') | src/opts/SkBlitRow_opts_mips_dsp.cpp » ('j') | no next file with comments »