Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(84)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 177963003: ARM Skia NEON patches - 25 - S32A_D565_Opaque_Dither clean/bugfix/speed (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Add ignored-tests.txt Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 952 matching lines...) Expand 10 before | Expand all | Expand 10 after
963 #endif 963 #endif
964 964
965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
966 const SkPMColor* SK_RESTRICT src, 966 const SkPMColor* SK_RESTRICT src,
967 int count, U8CPU alpha, int x, int y) { 967 int count, U8CPU alpha, int x, int y) {
968 SkASSERT(255 == alpha); 968 SkASSERT(255 == alpha);
969 969
970 #define UNROLL 8 970 #define UNROLL 8
971 971
972 if (count >= UNROLL) { 972 if (count >= UNROLL) {
973 uint8x8_t dbase;
974 973
975 #if defined(DEBUG_OPAQUE_DITHER) 974 #if defined(DEBUG_OPAQUE_DITHER)
976 uint16_t tmpbuf[UNROLL]; 975 uint16_t tmpbuf[UNROLL];
977 int td[UNROLL]; 976 int td[UNROLL];
978 int tdv[UNROLL]; 977 int tdv[UNROLL];
979 int ta[UNROLL]; 978 int ta[UNROLL];
980 int tap[UNROLL]; 979 int tap[UNROLL];
981 uint16_t in_dst[UNROLL]; 980 uint16_t in_dst[UNROLL];
982 int offset = 0; 981 int offset = 0;
983 int noisy = 0; 982 int noisy = 0;
984 #endif 983 #endif
985 984
985 uint8x8_t dbase;
986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
987 dbase = vld1_u8(dstart); 987 dbase = vld1_u8(dstart);
988 988
989 do { 989 do {
990 uint8x8_t sr, sg, sb, sa, d; 990 uint8x8_t sr, sg, sb, sa, d;
991 uint16x8_t dst8, scale8, alpha8; 991 uint16x8_t dst8, scale8, alpha8;
992 uint16x8_t dst_r, dst_g, dst_b; 992 uint16x8_t dst_r, dst_g, dst_b;
993 993
994 #if defined(DEBUG_OPAQUE_DITHER) 994 #if defined(DEBUG_OPAQUE_DITHER)
995 /* calculate 8 elements worth into a temp buffer */ 995 // calculate 8 elements worth into a temp buffer
996 { 996 {
997 int my_y = y; 997 int my_y = y;
998 int my_x = x; 998 int my_x = x;
999 SkPMColor* my_src = (SkPMColor*)src; 999 SkPMColor* my_src = (SkPMColor*)src;
1000 uint16_t* my_dst = dst; 1000 uint16_t* my_dst = dst;
1001 int i; 1001 int i;
1002 1002
1003 DITHER_565_SCAN(my_y); 1003 DITHER_565_SCAN(my_y);
1004 for(i=0;i<UNROLL;i++) { 1004 for(i = 0; i < UNROLL; i++) {
1005 SkPMColor c = *my_src++; 1005 SkPMColor c = *my_src++;
1006 SkPMColorAssert(c); 1006 SkPMColorAssert(c);
1007 if (c) { 1007 if (c) {
1008 unsigned a = SkGetPackedA32(c); 1008 unsigned a = SkGetPackedA32(c);
1009 1009
1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1011 tdv[i] = DITHER_VALUE(my_x); 1011 tdv[i] = DITHER_VALUE(my_x);
1012 ta[i] = a; 1012 ta[i] = a;
1013 tap[i] = SkAlpha255To256(a); 1013 tap[i] = SkAlpha255To256(a);
1014 td[i] = d; 1014 td[i] = d;
1015 1015
1016 unsigned sr = SkGetPackedR32(c); 1016 unsigned sr = SkGetPackedR32(c);
1017 unsigned sg = SkGetPackedG32(c); 1017 unsigned sg = SkGetPackedG32(c);
1018 unsigned sb = SkGetPackedB32(c); 1018 unsigned sb = SkGetPackedB32(c);
1019 sr = SkDITHER_R32_FOR_565(sr, d); 1019 sr = SkDITHER_R32_FOR_565(sr, d);
1020 sg = SkDITHER_G32_FOR_565(sg, d); 1020 sg = SkDITHER_G32_FOR_565(sg, d);
1021 sb = SkDITHER_B32_FOR_565(sb, d); 1021 sb = SkDITHER_B32_FOR_565(sb, d);
1022 1022
1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1026 // now src and dst expanded are in g:11 r:10 x:1 b:10 1026 // now src and dst expanded are in g:11 r:10 x:1 b:10
1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5) ; 1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5) ;
1028 td[i] = d; 1028 td[i] = d;
1029
1030 } else { 1029 } else {
1031 tmpbuf[i] = *my_dst; 1030 tmpbuf[i] = *my_dst;
1032 ta[i] = tdv[i] = td[i] = 0xbeef; 1031 ta[i] = tdv[i] = td[i] = 0xbeef;
1033 } 1032 }
1034 in_dst[i] = *my_dst; 1033 in_dst[i] = *my_dst;
1035 my_dst += 1; 1034 my_dst += 1;
1036 DITHER_INC_X(my_x); 1035 DITHER_INC_X(my_x);
1037 } 1036 }
1038 } 1037 }
1039 #endif 1038 #endif
1040 1039
1041 /* source is in ABGR */ 1040
1042 { 1041 {
1043 register uint8x8_t d0 asm("d0"); 1042 register uint8x8_t d0 asm("d0");
1044 register uint8x8_t d1 asm("d1"); 1043 register uint8x8_t d1 asm("d1");
1045 register uint8x8_t d2 asm("d2"); 1044 register uint8x8_t d2 asm("d2");
1046 register uint8x8_t d3 asm("d3"); 1045 register uint8x8_t d3 asm("d3");
1047 1046
1048 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1047 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1049 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1048 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1050 : "r" (src) 1049 :
1051 ); 1050 );
1051 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1052 sr = d2; sg = d1; sb = d0; sa = d3;
1053 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1052 sr = d0; sg = d1; sb = d2; sa = d3; 1054 sr = d0; sg = d1; sb = d2; sa = d3;
1055 #endif
1053 } 1056 }
1054 1057
1055 /* calculate 'd', which will be 0..7 */ 1058 /* calculate 'd', which will be 0..7
1056 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 1059 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1057 #if defined(SK_BUILD_FOR_ANDROID) 1060 */
1058 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1061 alpha8 = vmovl_u8(dbase);
1059 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 1062 alpha8 = vmlal_u8(alpha8, sa, dbase);
1060 #else 1063 d = vshrn_n_u16(alpha8, 8); // narrowing too
1061 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1062 #endif
1063 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1064 d = vshrn_n_u16(alpha8, 8); /* narrowing too */
1065 1064
1066 /* sr = sr - (sr>>5) + d */ 1065 // sr = sr - (sr>>5) + d
1067 /* watching for 8-bit overflow. d is 0..7; risky range of 1066 /* watching for 8-bit overflow. d is 0..7; risky range of
1068 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1067 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1069 * safe as long as we do ((sr-sr>>5) + d) */ 1068 * safe as long as we do ((sr-sr>>5) + d)
1069 */
1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1071 sr = vadd_u8(sr, d); 1071 sr = vadd_u8(sr, d);
1072 1072
1073 /* sb = sb - (sb>>5) + d */ 1073 // sb = sb - (sb>>5) + d
1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1075 sb = vadd_u8(sb, d); 1075 sb = vadd_u8(sb, d);
1076 1076
1077 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1077 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1079 sg = vadd_u8(sg, vshr_n_u8(d,1)); 1079 sg = vadd_u8(sg, vshr_n_u8(d,1));
1080 1080
1081 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 1081 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1082 dst8 = vld1q_u16(dst); 1082 dst8 = vld1q_u16(dst);
1083 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 1083 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1084 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 1084 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16 _BITS);
1085 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 1085 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
1086 1086
1087 /* blend */ 1087 // blend
1088 #if 1
1089 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1090 /* originally 255-sa + 1 */
1091 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1088 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1092 #else
1093 scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1094 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1095 #endif
1096 1089
1097 #if 1 1090 // combine the addq and mul, save 3 insns
1098 /* combine the addq and mul, save 3 insns */
1099 scale8 = vshrq_n_u16(scale8, 3); 1091 scale8 = vshrq_n_u16(scale8, 3);
1100 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1092 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1101 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1093 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1102 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1094 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1103 #else
1104 /* known correct, but +3 insns over above */
1105 scale8 = vshrq_n_u16(scale8, 3);
1106 dst_b = vmulq_u16(dst_b, scale8);
1107 dst_g = vmulq_u16(dst_g, scale8);
1108 dst_r = vmulq_u16(dst_r, scale8);
1109 1095
1110 /* combine */ 1096 // repack to store
1111 /* NB: vshll widens, need to preserve those bits */ 1097 dst8 = vshrq_n_u16(dst_b, 5);
1112 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1113 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1114 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1115 #endif
1116
1117 /* repack to store */
1118 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1119 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1098 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1120 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1099 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1121 1100
1122 vst1q_u16(dst, dst8); 1101 vst1q_u16(dst, dst8);
1123 1102
1124 #if defined(DEBUG_OPAQUE_DITHER) 1103 #if defined(DEBUG_OPAQUE_DITHER)
1125 /* verify my 8 elements match the temp buffer */ 1104 // verify my 8 elements match the temp buffer
1126 { 1105 {
1127 int i, bad=0; 1106 int i, bad=0;
1128 static int invocation; 1107 static int invocation;
1129 1108
1130 for (i=0;i<UNROLL;i++) 1109 for (i = 0; i < UNROLL; i++) {
1131 if (tmpbuf[i] != dst[i]) bad=1; 1110 if (tmpbuf[i] != dst[i]) {
1132 if (bad) { 1111 bad=1;
1133 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n" , 1112 }
1134 invocation, offset); 1113 }
1135 SkDebugf(" alpha 0x%x\n", alpha); 1114 if (bad) {
1136 for (i=0;i<UNROLL;i++) 1115 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset % d\n",
1137 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n ", 1116 invocation, offset);
1138 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 1117 SkDebugf(" alpha 0x%x\n", alpha);
1139 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 1118 for (i = 0; i < UNROLL; i++)
1119 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %0 4x\n",
1120 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[ i],
1121 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1140 1122
1141 showme16("alpha8", &alpha8, sizeof(alpha8)); 1123 showme16("alpha8", &alpha8, sizeof(alpha8));
1142 showme16("scale8", &scale8, sizeof(scale8)); 1124 showme16("scale8", &scale8, sizeof(scale8));
1143 showme8("d", &d, sizeof(d)); 1125 showme8("d", &d, sizeof(d));
1144 showme16("dst8", &dst8, sizeof(dst8)); 1126 showme16("dst8", &dst8, sizeof(dst8));
1145 showme16("dst_b", &dst_b, sizeof(dst_b)); 1127 showme16("dst_b", &dst_b, sizeof(dst_b));
1146 showme16("dst_g", &dst_g, sizeof(dst_g)); 1128 showme16("dst_g", &dst_g, sizeof(dst_g));
1147 showme16("dst_r", &dst_r, sizeof(dst_r)); 1129 showme16("dst_r", &dst_r, sizeof(dst_r));
1148 showme8("sb", &sb, sizeof(sb)); 1130 showme8("sb", &sb, sizeof(sb));
1149 showme8("sg", &sg, sizeof(sg)); 1131 showme8("sg", &sg, sizeof(sg));
1150 showme8("sr", &sr, sizeof(sr)); 1132 showme8("sr", &sr, sizeof(sr));
1151 1133
1152 /* cop out */ 1134 return;
1153 return; 1135 }
1154 } 1136 offset += UNROLL;
1155 offset += UNROLL; 1137 invocation++;
1156 invocation++; 1138 }
1157 }
1158 #endif 1139 #endif
1159 1140 dst += UNROLL;
1160 dst += UNROLL;
1161 src += UNROLL;
1162 count -= UNROLL; 1141 count -= UNROLL;
1163 /* skip x += UNROLL, since it's unchanged mod-4 */ 1142 // skip x += UNROLL, since it's unchanged mod-4
1164 } while (count >= UNROLL); 1143 } while (count >= UNROLL);
1165 } 1144 }
1166 #undef UNROLL 1145 #undef UNROLL
1167 1146
1168 /* residuals */ 1147 // residuals
1169 if (count > 0) { 1148 if (count > 0) {
1170 DITHER_565_SCAN(y); 1149 DITHER_565_SCAN(y);
1171 do { 1150 do {
1172 SkPMColor c = *src++; 1151 SkPMColor c = *src++;
1173 SkPMColorAssert(c); 1152 SkPMColorAssert(c);
1174 if (c) { 1153 if (c) {
1175 unsigned a = SkGetPackedA32(c); 1154 unsigned a = SkGetPackedA32(c);
1176 1155
1177 // dither and alpha are just temporary variables to work-around 1156 // dither and alpha are just temporary variables to work-around
1178 // an ICE in debug. 1157 // an ICE in debug.
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after
1434 * case where we do not inspect the src alpha. 1413 * case where we do not inspect the src alpha.
1435 */ 1414 */
1436 #if SK_A32_SHIFT == 24 1415 #if SK_A32_SHIFT == 24
1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1416 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1417 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1439 #else 1418 #else
1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1419 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1441 #endif 1420 #endif
1442 S32A_Blend_BlitRow32_neon // S32A_Blend 1421 S32A_Blend_BlitRow32_neon // S32A_Blend
1443 }; 1422 };
OLDNEW
« expectations/gm/ignored-tests.txt ('K') | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698