| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 952 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 963 #endif | 963 #endif |
| 964 | 964 |
| 965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| 966 const SkPMColor* SK_RESTRICT src, | 966 const SkPMColor* SK_RESTRICT src, |
| 967 int count, U8CPU alpha, int x, int y) { | 967 int count, U8CPU alpha, int x, int y) { |
| 968 SkASSERT(255 == alpha); | 968 SkASSERT(255 == alpha); |
| 969 | 969 |
| 970 #define UNROLL 8 | 970 #define UNROLL 8 |
| 971 | 971 |
| 972 if (count >= UNROLL) { | 972 if (count >= UNROLL) { |
| 973 uint8x8_t dbase; | |
| 974 | 973 |
| 975 #if defined(DEBUG_OPAQUE_DITHER) | 974 #if defined(DEBUG_OPAQUE_DITHER) |
| 976 uint16_t tmpbuf[UNROLL]; | 975 uint16_t tmpbuf[UNROLL]; |
| 977 int td[UNROLL]; | 976 int td[UNROLL]; |
| 978 int tdv[UNROLL]; | 977 int tdv[UNROLL]; |
| 979 int ta[UNROLL]; | 978 int ta[UNROLL]; |
| 980 int tap[UNROLL]; | 979 int tap[UNROLL]; |
| 981 uint16_t in_dst[UNROLL]; | 980 uint16_t in_dst[UNROLL]; |
| 982 int offset = 0; | 981 int offset = 0; |
| 983 int noisy = 0; | 982 int noisy = 0; |
| 984 #endif | 983 #endif |
| 985 | 984 |
| 985 uint8x8_t dbase; |
| 986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 987 dbase = vld1_u8(dstart); | 987 dbase = vld1_u8(dstart); |
| 988 | 988 |
| 989 do { | 989 do { |
| 990 uint8x8_t sr, sg, sb, sa, d; | 990 uint8x8_t sr, sg, sb, sa, d; |
| 991 uint16x8_t dst8, scale8, alpha8; | 991 uint16x8_t dst8, scale8, alpha8; |
| 992 uint16x8_t dst_r, dst_g, dst_b; | 992 uint16x8_t dst_r, dst_g, dst_b; |
| 993 | 993 |
| 994 #if defined(DEBUG_OPAQUE_DITHER) | 994 #if defined(DEBUG_OPAQUE_DITHER) |
| 995 /* calculate 8 elements worth into a temp buffer */ | 995 // calculate 8 elements worth into a temp buffer |
| 996 { | 996 { |
| 997 int my_y = y; | 997 int my_y = y; |
| 998 int my_x = x; | 998 int my_x = x; |
| 999 SkPMColor* my_src = (SkPMColor*)src; | 999 SkPMColor* my_src = (SkPMColor*)src; |
| 1000 uint16_t* my_dst = dst; | 1000 uint16_t* my_dst = dst; |
| 1001 int i; | 1001 int i; |
| 1002 | 1002 |
| 1003 DITHER_565_SCAN(my_y); | 1003 DITHER_565_SCAN(my_y); |
| 1004 for(i=0;i<UNROLL;i++) { | 1004 for(i = 0; i < UNROLL; i++) { |
| 1005 SkPMColor c = *my_src++; | 1005 SkPMColor c = *my_src++; |
| 1006 SkPMColorAssert(c); | 1006 SkPMColorAssert(c); |
| 1007 if (c) { | 1007 if (c) { |
| 1008 unsigned a = SkGetPackedA32(c); | 1008 unsigned a = SkGetPackedA32(c); |
| 1009 | 1009 |
| 1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); | 1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); |
| 1011 tdv[i] = DITHER_VALUE(my_x); | 1011 tdv[i] = DITHER_VALUE(my_x); |
| 1012 ta[i] = a; | 1012 ta[i] = a; |
| 1013 tap[i] = SkAlpha255To256(a); | 1013 tap[i] = SkAlpha255To256(a); |
| 1014 td[i] = d; | 1014 td[i] = d; |
| 1015 | 1015 |
| 1016 unsigned sr = SkGetPackedR32(c); | 1016 unsigned sr = SkGetPackedR32(c); |
| 1017 unsigned sg = SkGetPackedG32(c); | 1017 unsigned sg = SkGetPackedG32(c); |
| 1018 unsigned sb = SkGetPackedB32(c); | 1018 unsigned sb = SkGetPackedB32(c); |
| 1019 sr = SkDITHER_R32_FOR_565(sr, d); | 1019 sr = SkDITHER_R32_FOR_565(sr, d); |
| 1020 sg = SkDITHER_G32_FOR_565(sg, d); | 1020 sg = SkDITHER_G32_FOR_565(sg, d); |
| 1021 sb = SkDITHER_B32_FOR_565(sb, d); | 1021 sb = SkDITHER_B32_FOR_565(sb, d); |
| 1022 | 1022 |
| 1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); | 1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); |
| 1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); | 1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); |
| 1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
| 1026 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1026 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
| 1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5)
; | 1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5)
; |
| 1028 td[i] = d; | 1028 td[i] = d; |
| 1029 | |
| 1030 } else { | 1029 } else { |
| 1031 tmpbuf[i] = *my_dst; | 1030 tmpbuf[i] = *my_dst; |
| 1032 ta[i] = tdv[i] = td[i] = 0xbeef; | 1031 ta[i] = tdv[i] = td[i] = 0xbeef; |
| 1033 } | 1032 } |
| 1034 in_dst[i] = *my_dst; | 1033 in_dst[i] = *my_dst; |
| 1035 my_dst += 1; | 1034 my_dst += 1; |
| 1036 DITHER_INC_X(my_x); | 1035 DITHER_INC_X(my_x); |
| 1037 } | 1036 } |
| 1038 } | 1037 } |
| 1039 #endif | 1038 #endif |
| 1040 | 1039 |
| 1041 /* source is in ABGR */ | 1040 |
| 1042 { | 1041 { |
| 1043 register uint8x8_t d0 asm("d0"); | 1042 register uint8x8_t d0 asm("d0"); |
| 1044 register uint8x8_t d1 asm("d1"); | 1043 register uint8x8_t d1 asm("d1"); |
| 1045 register uint8x8_t d2 asm("d2"); | 1044 register uint8x8_t d2 asm("d2"); |
| 1046 register uint8x8_t d3 asm("d3"); | 1045 register uint8x8_t d3 asm("d3"); |
| 1047 | 1046 |
| 1048 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1047 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| 1049 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) | 1048 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
| 1050 : "r" (src) | 1049 : |
| 1051 ); | 1050 ); |
| 1051 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| 1052 sr = d2; sg = d1; sb = d0; sa = d3; |
| 1053 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| 1052 sr = d0; sg = d1; sb = d2; sa = d3; | 1054 sr = d0; sg = d1; sb = d2; sa = d3; |
| 1055 #endif |
| 1053 } | 1056 } |
| 1054 | 1057 |
| 1055 /* calculate 'd', which will be 0..7 */ | 1058 /* calculate 'd', which will be 0..7 |
| 1056 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ | 1059 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
| 1057 #if defined(SK_BUILD_FOR_ANDROID) | 1060 */ |
| 1058 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ | 1061 alpha8 = vmovl_u8(dbase); |
| 1059 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); | 1062 alpha8 = vmlal_u8(alpha8, sa, dbase); |
| 1060 #else | 1063 d = vshrn_n_u16(alpha8, 8); // narrowing too |
| 1061 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); | |
| 1062 #endif | |
| 1063 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); | |
| 1064 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ | |
| 1065 | 1064 |
| 1066 /* sr = sr - (sr>>5) + d */ | 1065 // sr = sr - (sr>>5) + d |
| 1067 /* watching for 8-bit overflow. d is 0..7; risky range of | 1066 /* watching for 8-bit overflow. d is 0..7; risky range of |
| 1068 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; | 1067 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; |
| 1069 * safe as long as we do ((sr-sr>>5) + d) */ | 1068 * safe as long as we do ((sr-sr>>5) + d) |
| 1069 */ |
| 1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
| 1071 sr = vadd_u8(sr, d); | 1071 sr = vadd_u8(sr, d); |
| 1072 | 1072 |
| 1073 /* sb = sb - (sb>>5) + d */ | 1073 // sb = sb - (sb>>5) + d |
| 1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); | 1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); |
| 1075 sb = vadd_u8(sb, d); | 1075 sb = vadd_u8(sb, d); |
| 1076 | 1076 |
| 1077 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ | 1077 // sg = sg - (sg>>6) + d>>1; similar logic for overflows |
| 1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); | 1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
| 1079 sg = vadd_u8(sg, vshr_n_u8(d,1)); | 1079 sg = vadd_u8(sg, vshr_n_u8(d,1)); |
| 1080 | 1080 |
| 1081 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ | 1081 // need to pick up 8 dst's -- at 16 bits each, 128 bits |
| 1082 dst8 = vld1q_u16(dst); | 1082 dst8 = vld1q_u16(dst); |
| 1083 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); | 1083 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK)); |
| 1084 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); | 1084 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16
_BITS); |
| 1085 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ | 1085 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits |
| 1086 | 1086 |
| 1087 /* blend */ | 1087 // blend |
| 1088 #if 1 | |
| 1089 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ | |
| 1090 /* originally 255-sa + 1 */ | |
| 1091 scale8 = vsubw_u8(vdupq_n_u16(256), sa); | 1088 scale8 = vsubw_u8(vdupq_n_u16(256), sa); |
| 1092 #else | |
| 1093 scale8 = vsubw_u8(vdupq_n_u16(255), sa); | |
| 1094 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); | |
| 1095 #endif | |
| 1096 | 1089 |
| 1097 #if 1 | 1090 // combine the addq and mul, save 3 insns |
| 1098 /* combine the addq and mul, save 3 insns */ | |
| 1099 scale8 = vshrq_n_u16(scale8, 3); | 1091 scale8 = vshrq_n_u16(scale8, 3); |
| 1100 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); | 1092 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); |
| 1101 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); | 1093 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); |
| 1102 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); | 1094 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); |
| 1103 #else | |
| 1104 /* known correct, but +3 insns over above */ | |
| 1105 scale8 = vshrq_n_u16(scale8, 3); | |
| 1106 dst_b = vmulq_u16(dst_b, scale8); | |
| 1107 dst_g = vmulq_u16(dst_g, scale8); | |
| 1108 dst_r = vmulq_u16(dst_r, scale8); | |
| 1109 | 1095 |
| 1110 /* combine */ | 1096 // repack to store |
| 1111 /* NB: vshll widens, need to preserve those bits */ | 1097 dst8 = vshrq_n_u16(dst_b, 5); |
| 1112 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); | |
| 1113 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); | |
| 1114 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); | |
| 1115 #endif | |
| 1116 | |
| 1117 /* repack to store */ | |
| 1118 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); | |
| 1119 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); | 1098 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); |
| 1120 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); | 1099 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); |
| 1121 | 1100 |
| 1122 vst1q_u16(dst, dst8); | 1101 vst1q_u16(dst, dst8); |
| 1123 | 1102 |
| 1124 #if defined(DEBUG_OPAQUE_DITHER) | 1103 #if defined(DEBUG_OPAQUE_DITHER) |
| 1125 /* verify my 8 elements match the temp buffer */ | 1104 // verify my 8 elements match the temp buffer |
| 1126 { | 1105 { |
| 1127 int i, bad=0; | 1106 int i, bad=0; |
| 1128 static int invocation; | 1107 static int invocation; |
| 1129 | 1108 |
| 1130 for (i=0;i<UNROLL;i++) | 1109 for (i = 0; i < UNROLL; i++) { |
| 1131 if (tmpbuf[i] != dst[i]) bad=1; | 1110 if (tmpbuf[i] != dst[i]) { |
| 1132 if (bad) { | 1111 bad=1; |
| 1133 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n"
, | 1112 } |
| 1134 invocation, offset); | 1113 } |
| 1135 SkDebugf(" alpha 0x%x\n", alpha); | 1114 if (bad) { |
| 1136 for (i=0;i<UNROLL;i++) | 1115 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %
d\n", |
| 1137 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n
", | 1116 invocation, offset); |
| 1138 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), | 1117 SkDebugf(" alpha 0x%x\n", alpha); |
| 1139 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); | 1118 for (i = 0; i < UNROLL; i++) |
| 1119 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %0
4x\n", |
| 1120 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[
i], |
| 1121 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); |
| 1140 | 1122 |
| 1141 showme16("alpha8", &alpha8, sizeof(alpha8)); | 1123 showme16("alpha8", &alpha8, sizeof(alpha8)); |
| 1142 showme16("scale8", &scale8, sizeof(scale8)); | 1124 showme16("scale8", &scale8, sizeof(scale8)); |
| 1143 showme8("d", &d, sizeof(d)); | 1125 showme8("d", &d, sizeof(d)); |
| 1144 showme16("dst8", &dst8, sizeof(dst8)); | 1126 showme16("dst8", &dst8, sizeof(dst8)); |
| 1145 showme16("dst_b", &dst_b, sizeof(dst_b)); | 1127 showme16("dst_b", &dst_b, sizeof(dst_b)); |
| 1146 showme16("dst_g", &dst_g, sizeof(dst_g)); | 1128 showme16("dst_g", &dst_g, sizeof(dst_g)); |
| 1147 showme16("dst_r", &dst_r, sizeof(dst_r)); | 1129 showme16("dst_r", &dst_r, sizeof(dst_r)); |
| 1148 showme8("sb", &sb, sizeof(sb)); | 1130 showme8("sb", &sb, sizeof(sb)); |
| 1149 showme8("sg", &sg, sizeof(sg)); | 1131 showme8("sg", &sg, sizeof(sg)); |
| 1150 showme8("sr", &sr, sizeof(sr)); | 1132 showme8("sr", &sr, sizeof(sr)); |
| 1151 | 1133 |
| 1152 /* cop out */ | 1134 return; |
| 1153 return; | 1135 } |
| 1154 } | 1136 offset += UNROLL; |
| 1155 offset += UNROLL; | 1137 invocation++; |
| 1156 invocation++; | 1138 } |
| 1157 } | |
| 1158 #endif | 1139 #endif |
| 1159 | 1140 dst += UNROLL; |
| 1160 dst += UNROLL; | |
| 1161 src += UNROLL; | |
| 1162 count -= UNROLL; | 1141 count -= UNROLL; |
| 1163 /* skip x += UNROLL, since it's unchanged mod-4 */ | 1142 // skip x += UNROLL, since it's unchanged mod-4 |
| 1164 } while (count >= UNROLL); | 1143 } while (count >= UNROLL); |
| 1165 } | 1144 } |
| 1166 #undef UNROLL | 1145 #undef UNROLL |
| 1167 | 1146 |
| 1168 /* residuals */ | 1147 // residuals |
| 1169 if (count > 0) { | 1148 if (count > 0) { |
| 1170 DITHER_565_SCAN(y); | 1149 DITHER_565_SCAN(y); |
| 1171 do { | 1150 do { |
| 1172 SkPMColor c = *src++; | 1151 SkPMColor c = *src++; |
| 1173 SkPMColorAssert(c); | 1152 SkPMColorAssert(c); |
| 1174 if (c) { | 1153 if (c) { |
| 1175 unsigned a = SkGetPackedA32(c); | 1154 unsigned a = SkGetPackedA32(c); |
| 1176 | 1155 |
| 1177 // dither and alpha are just temporary variables to work-around | 1156 // dither and alpha are just temporary variables to work-around |
| 1178 // an ICE in debug. | 1157 // an ICE in debug. |
| (...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1434 * case where we do not inspect the src alpha. | 1413 * case where we do not inspect the src alpha. |
| 1435 */ | 1414 */ |
| 1436 #if SK_A32_SHIFT == 24 | 1415 #if SK_A32_SHIFT == 24 |
| 1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1416 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1417 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1439 #else | 1418 #else |
| 1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1419 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1441 #endif | 1420 #endif |
| 1442 S32A_Blend_BlitRow32_neon // S32A_Blend | 1421 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1443 }; | 1422 }; |
| OLD | NEW |