src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 177963003: ARM Skia NEON patches - 25 - S32A_D565_Opaque_Dither clean/bugfix/speed

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 177963003: ARM Skia NEON patches - 25 - S32A_D565_Opaque_Dither clean/bugfix/speed (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Add ignored-tests.txt Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 952 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
963 #endif	963 #endif

964	964

965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,	965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,

966 const SkPMColor* SK_RESTRICT src,	966 const SkPMColor* SK_RESTRICT src,

967 int count, U8CPU alpha, int x, int y) {	967 int count, U8CPU alpha, int x, int y) {

968 SkASSERT(255 == alpha);	968 SkASSERT(255 == alpha);

969	969

970 #define UNROLL 8	970 #define UNROLL 8

971	971

972 if (count >= UNROLL) {	972 if (count >= UNROLL) {

973 uint8x8_t dbase;

974	973

975 #if defined(DEBUG_OPAQUE_DITHER)	974 #if defined(DEBUG_OPAQUE_DITHER)

976 uint16_t tmpbuf[UNROLL];	975 uint16_t tmpbuf[UNROLL];

977 int td[UNROLL];	976 int td[UNROLL];

978 int tdv[UNROLL];	977 int tdv[UNROLL];

979 int ta[UNROLL];	978 int ta[UNROLL];

980 int tap[UNROLL];	979 int tap[UNROLL];

981 uint16_t in_dst[UNROLL];	980 uint16_t in_dst[UNROLL];

982 int offset = 0;	981 int offset = 0;

983 int noisy = 0;	982 int noisy = 0;

984 #endif	983 #endif

985	984

	985 uint8x8_t dbase;

986 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];	986 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];

987 dbase = vld1_u8(dstart);	987 dbase = vld1_u8(dstart);

988	988

989 do {	989 do {

990 uint8x8_t sr, sg, sb, sa, d;	990 uint8x8_t sr, sg, sb, sa, d;

991 uint16x8_t dst8, scale8, alpha8;	991 uint16x8_t dst8, scale8, alpha8;

992 uint16x8_t dst_r, dst_g, dst_b;	992 uint16x8_t dst_r, dst_g, dst_b;

993	993

994 #if defined(DEBUG_OPAQUE_DITHER)	994 #if defined(DEBUG_OPAQUE_DITHER)

995 /* calculate 8 elements worth into a temp buffer */	995 // calculate 8 elements worth into a temp buffer

996 {	996 {

997 int my_y = y;	997 int my_y = y;

998 int my_x = x;	998 int my_x = x;

999 SkPMColor* my_src = (SkPMColor*)src;	999 SkPMColor* my_src = (SkPMColor*)src;

1000 uint16_t* my_dst = dst;	1000 uint16_t* my_dst = dst;

1001 int i;	1001 int i;

1002	1002

1003 DITHER_565_SCAN(my_y);	1003 DITHER_565_SCAN(my_y);

1004 for(i=0;i<UNROLL;i++) {	1004 for(i = 0; i < UNROLL; i++) {

1005 SkPMColor c = *my_src++;	1005 SkPMColor c = *my_src++;

1006 SkPMColorAssert(c);	1006 SkPMColorAssert(c);

1007 if (c) {	1007 if (c) {

1008 unsigned a = SkGetPackedA32(c);	1008 unsigned a = SkGetPackedA32(c);

1009	1009

1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));	1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));

1011 tdv[i] = DITHER_VALUE(my_x);	1011 tdv[i] = DITHER_VALUE(my_x);

1012 ta[i] = a;	1012 ta[i] = a;

1013 tap[i] = SkAlpha255To256(a);	1013 tap[i] = SkAlpha255To256(a);

1014 td[i] = d;	1014 td[i] = d;

1015	1015

1016 unsigned sr = SkGetPackedR32(c);	1016 unsigned sr = SkGetPackedR32(c);

1017 unsigned sg = SkGetPackedG32(c);	1017 unsigned sg = SkGetPackedG32(c);

1018 unsigned sb = SkGetPackedB32(c);	1018 unsigned sb = SkGetPackedB32(c);

1019 sr = SkDITHER_R32_FOR_565(sr, d);	1019 sr = SkDITHER_R32_FOR_565(sr, d);

1020 sg = SkDITHER_G32_FOR_565(sg, d);	1020 sg = SkDITHER_G32_FOR_565(sg, d);

1021 sb = SkDITHER_B32_FOR_565(sb, d);	1021 sb = SkDITHER_B32_FOR_565(sb, d);

1022	1022

1023 uint32_t src_expanded = (sg << 24) \| (sr << 13) \| (sb << 2);	1023 uint32_t src_expanded = (sg << 24) \| (sr << 13) \| (sb << 2);

1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);	1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);

1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);	1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

1026 // now src and dst expanded are in g:11 r:10 x:1 b:10	1026 // now src and dst expanded are in g:11 r:10 x:1 b:10

1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5) ;	1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5) ;

1028 td[i] = d;	1028 td[i] = d;

1029

1030 } else {	1029 } else {

1031 tmpbuf[i] = *my_dst;	1030 tmpbuf[i] = *my_dst;

1032 ta[i] = tdv[i] = td[i] = 0xbeef;	1031 ta[i] = tdv[i] = td[i] = 0xbeef;

1033 }	1032 }

1034 in_dst[i] = *my_dst;	1033 in_dst[i] = *my_dst;

1035 my_dst += 1;	1034 my_dst += 1;

1036 DITHER_INC_X(my_x);	1035 DITHER_INC_X(my_x);

1037 }	1036 }

1038 }	1037 }

1039 #endif	1038 #endif

1040	1039

1041 /* source is in ABGR */	1040

1042 {	1041 {

1043 register uint8x8_t d0 asm("d0");	1042 register uint8x8_t d0 asm("d0");

1044 register uint8x8_t d1 asm("d1");	1043 register uint8x8_t d1 asm("d1");

1045 register uint8x8_t d2 asm("d2");	1044 register uint8x8_t d2 asm("d2");

1046 register uint8x8_t d3 asm("d3");	1045 register uint8x8_t d3 asm("d3");

1047	1046

1048 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"	1047 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"

1049 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)	1048 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)

1050 : "r" (src)	1049 :

1051 );	1050 );

	1051 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)

	1052 sr = d2; sg = d1; sb = d0; sa = d3;

	1053 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)

1052 sr = d0; sg = d1; sb = d2; sa = d3;	1054 sr = d0; sg = d1; sb = d2; sa = d3;

	1055 #endif

1053 }	1056 }

1054	1057

1055 /* calculate 'd', which will be 0..7 */	1058 /* calculate 'd', which will be 0..7

1056 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */	1059 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice

1057 #if defined(SK_BUILD_FOR_ANDROID)	1060 */

1058 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */	1061 alpha8 = vmovl_u8(dbase);

1059 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));	1062 alpha8 = vmlal_u8(alpha8, sa, dbase);

1060 #else	1063 d = vshrn_n_u16(alpha8, 8); // narrowing too

1061 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));

1062 #endif

1063 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));

1064 d = vshrn_n_u16(alpha8, 8); /* narrowing too */

1065	1064

1066 /* sr = sr - (sr>>5) + d */	1065 // sr = sr - (sr>>5) + d

1067 /* watching for 8-bit overflow. d is 0..7; risky range of	1066 /* watching for 8-bit overflow. d is 0..7; risky range of

1068 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';	1067 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';

1069 * safe as long as we do ((sr-sr>>5) + d) */	1068 * safe as long as we do ((sr-sr>>5) + d)

	1069 */

1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5));	1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5));

1071 sr = vadd_u8(sr, d);	1071 sr = vadd_u8(sr, d);

1072	1072

1073 /* sb = sb - (sb>>5) + d */	1073 // sb = sb - (sb>>5) + d

1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5));	1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5));

1075 sb = vadd_u8(sb, d);	1075 sb = vadd_u8(sb, d);

1076	1076

1077 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */	1077 // sg = sg - (sg>>6) + d>>1; similar logic for overflows

1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6));	1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6));

1079 sg = vadd_u8(sg, vshr_n_u8(d,1));	1079 sg = vadd_u8(sg, vshr_n_u8(d,1));

1080	1080

1081 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */	1081 // need to pick up 8 dst's -- at 16 bits each, 128 bits

1082 dst8 = vld1q_u16(dst);	1082 dst8 = vld1q_u16(dst);

1083 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));	1083 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));

1084 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));	1084 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16 _BITS);

1085 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */	1085 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits

1086	1086

1087 /* blend */	1087 // blend

1088 #if 1

1089 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */

1090 /* originally 255-sa + 1 */

1091 scale8 = vsubw_u8(vdupq_n_u16(256), sa);	1088 scale8 = vsubw_u8(vdupq_n_u16(256), sa);

1092 #else

1093 scale8 = vsubw_u8(vdupq_n_u16(255), sa);

1094 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));

1095 #endif

1096	1089

1097 #if 1	1090 // combine the addq and mul, save 3 insns

1098 /* combine the addq and mul, save 3 insns */

1099 scale8 = vshrq_n_u16(scale8, 3);	1091 scale8 = vshrq_n_u16(scale8, 3);

1100 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);	1092 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);

1101 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);	1093 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);

1102 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);	1094 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

1103 #else

1104 /* known correct, but +3 insns over above */

1105 scale8 = vshrq_n_u16(scale8, 3);

1106 dst_b = vmulq_u16(dst_b, scale8);

1107 dst_g = vmulq_u16(dst_g, scale8);

1108 dst_r = vmulq_u16(dst_r, scale8);

1109	1095

1110 /* combine */	1096 // repack to store

1111 /* NB: vshll widens, need to preserve those bits */	1097 dst8 = vshrq_n_u16(dst_b, 5);

1112 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));

1113 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));

1114 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));

1115 #endif

1116

1117 /* repack to store */

1118 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));

1119 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);	1098 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);

1120 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);	1099 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

1121	1100

1122 vst1q_u16(dst, dst8);	1101 vst1q_u16(dst, dst8);

1123	1102

1124 #if defined(DEBUG_OPAQUE_DITHER)	1103 #if defined(DEBUG_OPAQUE_DITHER)

1125 /* verify my 8 elements match the temp buffer */	1104 // verify my 8 elements match the temp buffer

1126 {	1105 {

1127 int i, bad=0;	1106 int i, bad=0;

1128 static int invocation;	1107 static int invocation;

1129	1108

1130 for (i=0;i<UNROLL;i++)	1109 for (i = 0; i < UNROLL; i++) {

1131 if (tmpbuf[i] != dst[i]) bad=1;	1110 if (tmpbuf[i] != dst[i]) {

1132 if (bad) {	1111 bad=1;

1133 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n" ,	1112 }

1134 invocation, offset);	1113 }

1135 SkDebugf(" alpha 0x%x\n", alpha);	1114 if (bad) {

1136 for (i=0;i<UNROLL;i++)	1115 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset % d\n",

1137 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n ",	1116 invocation, offset);

1138 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),	1117 SkDebugf(" alpha 0x%x\n", alpha);

1139 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);	1118 for (i = 0; i < UNROLL; i++)

	1119 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %0 4x\n",

	1120 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[ i],

	1121 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

1140	1122

1141 showme16("alpha8", &alpha8, sizeof(alpha8));	1123 showme16("alpha8", &alpha8, sizeof(alpha8));

1142 showme16("scale8", &scale8, sizeof(scale8));	1124 showme16("scale8", &scale8, sizeof(scale8));

1143 showme8("d", &d, sizeof(d));	1125 showme8("d", &d, sizeof(d));

1144 showme16("dst8", &dst8, sizeof(dst8));	1126 showme16("dst8", &dst8, sizeof(dst8));

1145 showme16("dst_b", &dst_b, sizeof(dst_b));	1127 showme16("dst_b", &dst_b, sizeof(dst_b));

1146 showme16("dst_g", &dst_g, sizeof(dst_g));	1128 showme16("dst_g", &dst_g, sizeof(dst_g));

1147 showme16("dst_r", &dst_r, sizeof(dst_r));	1129 showme16("dst_r", &dst_r, sizeof(dst_r));

1148 showme8("sb", &sb, sizeof(sb));	1130 showme8("sb", &sb, sizeof(sb));

1149 showme8("sg", &sg, sizeof(sg));	1131 showme8("sg", &sg, sizeof(sg));

1150 showme8("sr", &sr, sizeof(sr));	1132 showme8("sr", &sr, sizeof(sr));

1151	1133

1152 /* cop out */	1134 return;

1153 return;	1135 }

1154 }	1136 offset += UNROLL;

1155 offset += UNROLL;	1137 invocation++;

1156 invocation++;	1138 }

1157 }

1158 #endif	1139 #endif

1159	1140 dst += UNROLL;

1160 dst += UNROLL;

1161 src += UNROLL;

1162 count -= UNROLL;	1141 count -= UNROLL;

1163 /* skip x += UNROLL, since it's unchanged mod-4 */	1142 // skip x += UNROLL, since it's unchanged mod-4

1164 } while (count >= UNROLL);	1143 } while (count >= UNROLL);

1165 }	1144 }

1166 #undef UNROLL	1145 #undef UNROLL

1167	1146

1168 /* residuals */	1147 // residuals

1169 if (count > 0) {	1148 if (count > 0) {

1170 DITHER_565_SCAN(y);	1149 DITHER_565_SCAN(y);

1171 do {	1150 do {

1172 SkPMColor c = *src++;	1151 SkPMColor c = *src++;

1173 SkPMColorAssert(c);	1152 SkPMColorAssert(c);

1174 if (c) {	1153 if (c) {

1175 unsigned a = SkGetPackedA32(c);	1154 unsigned a = SkGetPackedA32(c);

1176	1155

1177 // dither and alpha are just temporary variables to work-around	1156 // dither and alpha are just temporary variables to work-around

1178 // an ICE in debug.	1157 // an ICE in debug.

(...skipping 255 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1434 * case where we do not inspect the src alpha.	1413 * case where we do not inspect the src alpha.

1435 */	1414 */

1436 #if SK_A32_SHIFT == 24	1415 #if SK_A32_SHIFT == 24

1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1416 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1417 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1439 #else	1418 #else

1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1419 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1441 #endif	1420 #endif

1442 S32A_Blend_BlitRow32_neon // S32A_Blend	1421 S32A_Blend_BlitRow32_neon // S32A_Blend

1443 };	1422 };

OLD	NEW

« expectations/gm/ignored-tests.txt ('K') | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »