OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 952 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
963 #endif | 963 #endif |
964 | 964 |
965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
966 const SkPMColor* SK_RESTRICT src, | 966 const SkPMColor* SK_RESTRICT src, |
967 int count, U8CPU alpha, int x, int y) { | 967 int count, U8CPU alpha, int x, int y) { |
968 SkASSERT(255 == alpha); | 968 SkASSERT(255 == alpha); |
969 | 969 |
970 #define UNROLL 8 | 970 #define UNROLL 8 |
971 | 971 |
972 if (count >= UNROLL) { | 972 if (count >= UNROLL) { |
973 uint8x8_t dbase; | |
974 | 973 |
975 #if defined(DEBUG_OPAQUE_DITHER) | 974 #if defined(DEBUG_OPAQUE_DITHER) |
976 uint16_t tmpbuf[UNROLL]; | 975 uint16_t tmpbuf[UNROLL]; |
977 int td[UNROLL]; | 976 int td[UNROLL]; |
978 int tdv[UNROLL]; | 977 int tdv[UNROLL]; |
979 int ta[UNROLL]; | 978 int ta[UNROLL]; |
980 int tap[UNROLL]; | 979 int tap[UNROLL]; |
981 uint16_t in_dst[UNROLL]; | 980 uint16_t in_dst[UNROLL]; |
982 int offset = 0; | 981 int offset = 0; |
983 int noisy = 0; | 982 int noisy = 0; |
984 #endif | 983 #endif |
985 | 984 |
| 985 uint8x8_t dbase; |
986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
987 dbase = vld1_u8(dstart); | 987 dbase = vld1_u8(dstart); |
988 | 988 |
989 do { | 989 do { |
990 uint8x8_t sr, sg, sb, sa, d; | 990 uint8x8_t sr, sg, sb, sa, d; |
991 uint16x8_t dst8, scale8, alpha8; | 991 uint16x8_t dst8, scale8, alpha8; |
992 uint16x8_t dst_r, dst_g, dst_b; | 992 uint16x8_t dst_r, dst_g, dst_b; |
993 | 993 |
994 #if defined(DEBUG_OPAQUE_DITHER) | 994 #if defined(DEBUG_OPAQUE_DITHER) |
995 /* calculate 8 elements worth into a temp buffer */ | 995 // calculate 8 elements worth into a temp buffer |
996 { | 996 { |
997 int my_y = y; | 997 int my_y = y; |
998 int my_x = x; | 998 int my_x = x; |
999 SkPMColor* my_src = (SkPMColor*)src; | 999 SkPMColor* my_src = (SkPMColor*)src; |
1000 uint16_t* my_dst = dst; | 1000 uint16_t* my_dst = dst; |
1001 int i; | 1001 int i; |
1002 | 1002 |
1003 DITHER_565_SCAN(my_y); | 1003 DITHER_565_SCAN(my_y); |
1004 for(i=0;i<UNROLL;i++) { | 1004 for(i = 0; i < UNROLL; i++) { |
1005 SkPMColor c = *my_src++; | 1005 SkPMColor c = *my_src++; |
1006 SkPMColorAssert(c); | 1006 SkPMColorAssert(c); |
1007 if (c) { | 1007 if (c) { |
1008 unsigned a = SkGetPackedA32(c); | 1008 unsigned a = SkGetPackedA32(c); |
1009 | 1009 |
1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); | 1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); |
1011 tdv[i] = DITHER_VALUE(my_x); | 1011 tdv[i] = DITHER_VALUE(my_x); |
1012 ta[i] = a; | 1012 ta[i] = a; |
1013 tap[i] = SkAlpha255To256(a); | 1013 tap[i] = SkAlpha255To256(a); |
1014 td[i] = d; | 1014 td[i] = d; |
1015 | 1015 |
1016 unsigned sr = SkGetPackedR32(c); | 1016 unsigned sr = SkGetPackedR32(c); |
1017 unsigned sg = SkGetPackedG32(c); | 1017 unsigned sg = SkGetPackedG32(c); |
1018 unsigned sb = SkGetPackedB32(c); | 1018 unsigned sb = SkGetPackedB32(c); |
1019 sr = SkDITHER_R32_FOR_565(sr, d); | 1019 sr = SkDITHER_R32_FOR_565(sr, d); |
1020 sg = SkDITHER_G32_FOR_565(sg, d); | 1020 sg = SkDITHER_G32_FOR_565(sg, d); |
1021 sb = SkDITHER_B32_FOR_565(sb, d); | 1021 sb = SkDITHER_B32_FOR_565(sb, d); |
1022 | 1022 |
1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); | 1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); |
1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); | 1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); |
1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
1026 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1026 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5)
; | 1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5)
; |
1028 td[i] = d; | 1028 td[i] = d; |
1029 | |
1030 } else { | 1029 } else { |
1031 tmpbuf[i] = *my_dst; | 1030 tmpbuf[i] = *my_dst; |
1032 ta[i] = tdv[i] = td[i] = 0xbeef; | 1031 ta[i] = tdv[i] = td[i] = 0xbeef; |
1033 } | 1032 } |
1034 in_dst[i] = *my_dst; | 1033 in_dst[i] = *my_dst; |
1035 my_dst += 1; | 1034 my_dst += 1; |
1036 DITHER_INC_X(my_x); | 1035 DITHER_INC_X(my_x); |
1037 } | 1036 } |
1038 } | 1037 } |
1039 #endif | 1038 #endif |
1040 | 1039 |
1041 /* source is in ABGR */ | 1040 |
1042 { | 1041 { |
1043 register uint8x8_t d0 asm("d0"); | 1042 register uint8x8_t d0 asm("d0"); |
1044 register uint8x8_t d1 asm("d1"); | 1043 register uint8x8_t d1 asm("d1"); |
1045 register uint8x8_t d2 asm("d2"); | 1044 register uint8x8_t d2 asm("d2"); |
1046 register uint8x8_t d3 asm("d3"); | 1045 register uint8x8_t d3 asm("d3"); |
1047 | 1046 |
1048 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1047 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
1049 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) | 1048 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
1050 : "r" (src) | 1049 : |
1051 ); | 1050 ); |
| 1051 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| 1052 sr = d2; sg = d1; sb = d0; sa = d3; |
| 1053 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
1052 sr = d0; sg = d1; sb = d2; sa = d3; | 1054 sr = d0; sg = d1; sb = d2; sa = d3; |
| 1055 #endif |
1053 } | 1056 } |
1054 | 1057 |
1055 /* calculate 'd', which will be 0..7 */ | 1058 /* calculate 'd', which will be 0..7 |
1056 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ | 1059 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
1057 #if defined(SK_BUILD_FOR_ANDROID) | 1060 */ |
1058 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ | 1061 alpha8 = vmovl_u8(dbase); |
1059 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); | 1062 alpha8 = vmlal_u8(alpha8, sa, dbase); |
1060 #else | 1063 d = vshrn_n_u16(alpha8, 8); // narrowing too |
1061 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); | |
1062 #endif | |
1063 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); | |
1064 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ | |
1065 | 1064 |
1066 /* sr = sr - (sr>>5) + d */ | 1065 // sr = sr - (sr>>5) + d |
1067 /* watching for 8-bit overflow. d is 0..7; risky range of | 1066 /* watching for 8-bit overflow. d is 0..7; risky range of |
1068 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; | 1067 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; |
1069 * safe as long as we do ((sr-sr>>5) + d) */ | 1068 * safe as long as we do ((sr-sr>>5) + d) |
| 1069 */ |
1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
1071 sr = vadd_u8(sr, d); | 1071 sr = vadd_u8(sr, d); |
1072 | 1072 |
1073 /* sb = sb - (sb>>5) + d */ | 1073 // sb = sb - (sb>>5) + d |
1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); | 1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); |
1075 sb = vadd_u8(sb, d); | 1075 sb = vadd_u8(sb, d); |
1076 | 1076 |
1077 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ | 1077 // sg = sg - (sg>>6) + d>>1; similar logic for overflows |
1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); | 1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
1079 sg = vadd_u8(sg, vshr_n_u8(d,1)); | 1079 sg = vadd_u8(sg, vshr_n_u8(d,1)); |
1080 | 1080 |
1081 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ | 1081 // need to pick up 8 dst's -- at 16 bits each, 128 bits |
1082 dst8 = vld1q_u16(dst); | 1082 dst8 = vld1q_u16(dst); |
1083 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); | 1083 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK)); |
1084 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); | 1084 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16
_BITS); |
1085 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ | 1085 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits |
1086 | 1086 |
1087 /* blend */ | 1087 // blend |
1088 #if 1 | |
1089 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ | |
1090 /* originally 255-sa + 1 */ | |
1091 scale8 = vsubw_u8(vdupq_n_u16(256), sa); | 1088 scale8 = vsubw_u8(vdupq_n_u16(256), sa); |
1092 #else | |
1093 scale8 = vsubw_u8(vdupq_n_u16(255), sa); | |
1094 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); | |
1095 #endif | |
1096 | 1089 |
1097 #if 1 | 1090 // combine the addq and mul, save 3 insns |
1098 /* combine the addq and mul, save 3 insns */ | |
1099 scale8 = vshrq_n_u16(scale8, 3); | 1091 scale8 = vshrq_n_u16(scale8, 3); |
1100 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); | 1092 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); |
1101 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); | 1093 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); |
1102 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); | 1094 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); |
1103 #else | |
1104 /* known correct, but +3 insns over above */ | |
1105 scale8 = vshrq_n_u16(scale8, 3); | |
1106 dst_b = vmulq_u16(dst_b, scale8); | |
1107 dst_g = vmulq_u16(dst_g, scale8); | |
1108 dst_r = vmulq_u16(dst_r, scale8); | |
1109 | 1095 |
1110 /* combine */ | 1096 // repack to store |
1111 /* NB: vshll widens, need to preserve those bits */ | 1097 dst8 = vshrq_n_u16(dst_b, 5); |
1112 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); | |
1113 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); | |
1114 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); | |
1115 #endif | |
1116 | |
1117 /* repack to store */ | |
1118 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); | |
1119 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); | 1098 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); |
1120 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); | 1099 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); |
1121 | 1100 |
1122 vst1q_u16(dst, dst8); | 1101 vst1q_u16(dst, dst8); |
1123 | 1102 |
1124 #if defined(DEBUG_OPAQUE_DITHER) | 1103 #if defined(DEBUG_OPAQUE_DITHER) |
1125 /* verify my 8 elements match the temp buffer */ | 1104 // verify my 8 elements match the temp buffer |
1126 { | 1105 { |
1127 int i, bad=0; | 1106 int i, bad=0; |
1128 static int invocation; | 1107 static int invocation; |
1129 | 1108 |
1130 for (i=0;i<UNROLL;i++) | 1109 for (i = 0; i < UNROLL; i++) { |
1131 if (tmpbuf[i] != dst[i]) bad=1; | 1110 if (tmpbuf[i] != dst[i]) { |
1132 if (bad) { | 1111 bad=1; |
1133 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n"
, | 1112 } |
1134 invocation, offset); | 1113 } |
1135 SkDebugf(" alpha 0x%x\n", alpha); | 1114 if (bad) { |
1136 for (i=0;i<UNROLL;i++) | 1115 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %
d\n", |
1137 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n
", | 1116 invocation, offset); |
1138 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), | 1117 SkDebugf(" alpha 0x%x\n", alpha); |
1139 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); | 1118 for (i = 0; i < UNROLL; i++) |
| 1119 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %0
4x\n", |
| 1120 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[
i], |
| 1121 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); |
1140 | 1122 |
1141 showme16("alpha8", &alpha8, sizeof(alpha8)); | 1123 showme16("alpha8", &alpha8, sizeof(alpha8)); |
1142 showme16("scale8", &scale8, sizeof(scale8)); | 1124 showme16("scale8", &scale8, sizeof(scale8)); |
1143 showme8("d", &d, sizeof(d)); | 1125 showme8("d", &d, sizeof(d)); |
1144 showme16("dst8", &dst8, sizeof(dst8)); | 1126 showme16("dst8", &dst8, sizeof(dst8)); |
1145 showme16("dst_b", &dst_b, sizeof(dst_b)); | 1127 showme16("dst_b", &dst_b, sizeof(dst_b)); |
1146 showme16("dst_g", &dst_g, sizeof(dst_g)); | 1128 showme16("dst_g", &dst_g, sizeof(dst_g)); |
1147 showme16("dst_r", &dst_r, sizeof(dst_r)); | 1129 showme16("dst_r", &dst_r, sizeof(dst_r)); |
1148 showme8("sb", &sb, sizeof(sb)); | 1130 showme8("sb", &sb, sizeof(sb)); |
1149 showme8("sg", &sg, sizeof(sg)); | 1131 showme8("sg", &sg, sizeof(sg)); |
1150 showme8("sr", &sr, sizeof(sr)); | 1132 showme8("sr", &sr, sizeof(sr)); |
1151 | 1133 |
1152 /* cop out */ | 1134 return; |
1153 return; | 1135 } |
1154 } | 1136 offset += UNROLL; |
1155 offset += UNROLL; | 1137 invocation++; |
1156 invocation++; | 1138 } |
1157 } | |
1158 #endif | 1139 #endif |
1159 | 1140 dst += UNROLL; |
1160 dst += UNROLL; | |
1161 src += UNROLL; | |
1162 count -= UNROLL; | 1141 count -= UNROLL; |
1163 /* skip x += UNROLL, since it's unchanged mod-4 */ | 1142 // skip x += UNROLL, since it's unchanged mod-4 |
1164 } while (count >= UNROLL); | 1143 } while (count >= UNROLL); |
1165 } | 1144 } |
1166 #undef UNROLL | 1145 #undef UNROLL |
1167 | 1146 |
1168 /* residuals */ | 1147 // residuals |
1169 if (count > 0) { | 1148 if (count > 0) { |
1170 DITHER_565_SCAN(y); | 1149 DITHER_565_SCAN(y); |
1171 do { | 1150 do { |
1172 SkPMColor c = *src++; | 1151 SkPMColor c = *src++; |
1173 SkPMColorAssert(c); | 1152 SkPMColorAssert(c); |
1174 if (c) { | 1153 if (c) { |
1175 unsigned a = SkGetPackedA32(c); | 1154 unsigned a = SkGetPackedA32(c); |
1176 | 1155 |
1177 // dither and alpha are just temporary variables to work-around | 1156 // dither and alpha are just temporary variables to work-around |
1178 // an ICE in debug. | 1157 // an ICE in debug. |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1434 * case where we do not inspect the src alpha. | 1413 * case where we do not inspect the src alpha. |
1435 */ | 1414 */ |
1436 #if SK_A32_SHIFT == 24 | 1415 #if SK_A32_SHIFT == 24 |
1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1416 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1417 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1439 #else | 1418 #else |
1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1419 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1441 #endif | 1420 #endif |
1442 S32A_Blend_BlitRow32_neon // S32A_Blend | 1421 S32A_Blend_BlitRow32_neon // S32A_Blend |
1443 }; | 1422 }; |
OLD | NEW |