OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 1015 matching lines...) | |
1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
1027 } | 1027 } |
1028 dst += 1; | 1028 dst += 1; |
1029 DITHER_INC_X(x); | 1029 DITHER_INC_X(x); |
1030 } while (--count != 0); | 1030 } while (--count != 0); |
1031 } | 1031 } |
1032 } | 1032 } |
1033 | 1033 |
1034 /////////////////////////////////////////////////////////////////////////////// | 1034 /////////////////////////////////////////////////////////////////////////////// |
1035 | 1035 |
1036 /* 2009/10/27: RBE says "a work in progress"; debugging says ok; | |
1037 * speedup untested, but ARM version is 26 insns/iteration and | |
1038 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) | |
1039 * which is 10x the native version; that's pure instruction counts, | |
1040 * not accounting for any instruction or memory latencies. | |
1041 */ | |
1042 | |
1043 #undef DEBUG_S32_OPAQUE_DITHER | 1036 #undef DEBUG_S32_OPAQUE_DITHER |
1044 | 1037 |
1045 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, | 1038 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
1046 const SkPMColor* SK_RESTRICT src, | 1039 const SkPMColor* SK_RESTRICT src, |
1047 int count, U8CPU alpha, int x, int y) { | 1040 int count, U8CPU alpha, int x, int y) { |
1048 SkASSERT(255 == alpha); | 1041 SkASSERT(255 == alpha); |
1049 | 1042 |
1050 #define UNROLL 8 | 1043 #define UNROLL 8 |
1051 if (count >= UNROLL) { | 1044 if (count >= UNROLL) { |
1052 uint8x8_t d; | 1045 uint8x8_t d; |
1053 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1046 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1054 d = vld1_u8(dstart); | 1047 d = vld1_u8(dstart); |
1055 | 1048 |
1056 while (count >= UNROLL) { | 1049 while (count >= UNROLL) { |
1057 uint8x8_t sr, sg, sb; | 1050 uint8x8_t sr, sg, sb; |
1058 uint16x8_t dr, dg, db; | 1051 uint16x8_t dr, dg, db; |
1059 uint16x8_t dst8; | 1052 uint16x8_t dst8; |
1060 | 1053 |
1061 /* source is in ABGR ordering (R == lsb) */ | 1054 /* source is in ABGR ordering (R == lsb) */ |
1062 { | 1055 { |
1063 register uint8x8_t d0 asm("d0"); | 1056 register uint8x8_t d0 asm("d0"); |
1064 register uint8x8_t d1 asm("d1"); | 1057 register uint8x8_t d1 asm("d1"); |
1065 register uint8x8_t d2 asm("d2"); | 1058 register uint8x8_t d2 asm("d2"); |
1066 register uint8x8_t d3 asm("d3"); | 1059 register uint8x8_t d3 asm("d3"); |
1067 | 1060 |
1068 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1061 asm ( |
1069 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) | 1062 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
1070 : "r" (src) | 1063 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
1071 ); | 1064 : |
1072 sr = d0; sg = d1; sb = d2; | 1065 ); |
1066 sg = d1; | |
1067 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | |
1068 sr = d2; sb = d0; | |
1069 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | |
1070 sr = d0; sb = d2; | |
1071 #endif | |
1073 } | 1072 } |
1074 /* XXX: if we want to prefetch, hide it in the above asm() | 1073 /* XXX: if we want to prefetch, hide it in the above asm() |
1075 * using the gcc __builtin_prefetch(), the prefetch will | 1074 * using the gcc __builtin_prefetch(), the prefetch will |
1076 * fall to the bottom of the loop -- it won't stick up | 1075 * fall to the bottom of the loop -- it won't stick up |
1077 * at the top of the loop, just after the vld4. | 1076 * at the top of the loop, just after the vld4. |
1078 */ | 1077 */ |
1079 | 1078 |
1080 /* sr = sr - (sr>>5) + d */ | 1079 /* sr = sr - (sr>>5) + d */ |
1081 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1080 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
1082 dr = vaddl_u8(sr, d); | 1081 dr = vaddl_u8(sr, d); |
1083 | 1082 |
1084 /* sb = sb - (sb>>5) + d */ | 1083 /* sb = sb - (sb>>5) + d */ |
1085 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); | 1084 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); |
1086 db = vaddl_u8(sb, d); | 1085 db = vaddl_u8(sb, d); |
1087 | 1086 |
1088 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ | 1087 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ |
1089 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); | 1088 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
1090 dg = vaddl_u8(sg, vshr_n_u8(d,1)); | 1089 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); |
1091 /* XXX: check that the "d>>1" here is hoisted */ | 1090 /* XXX: check that the "d>>1" here is hoisted */ |
1092 | 1091 |
1093 /* pack high bits of each into 565 format (rgb, b is lsb) */ | 1092 /* pack high bits of each into 565 format (rgb, b is lsb) */ |
1094 dst8 = vshrq_n_u16(db, 3); | 1093 dst8 = vshrq_n_u16(db, 3); |
1095 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); | 1094 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); |
1096 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); | 1095 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); |
1097 | 1096 |
1098 /* store it */ | 1097 /* store it */ |
1099 vst1q_u16(dst, dst8); | 1098 vst1q_u16(dst, dst8); |
1100 | 1099 |
1101 #if defined(DEBUG_S32_OPAQUE_DITHER) | 1100 #if defined(DEBUG_S32_OPAQUE_DITHER) |
1102 /* always good to know if we generated good results */ | 1101 /* always good to know if we generated good results */ |
1103 { | 1102 { |
1104 int i, myx = x, myy = y; | 1103 int i, myx = x, myy = y; |
1105 DITHER_565_SCAN(myy); | 1104 DITHER_565_SCAN(myy); |
1106 for (i=0;i<UNROLL;i++) { | 1105 for (i=0;i<UNROLL;i++) { |
1107 SkPMColor c = src[i]; | 1106 SkPMColor c = src[i-8]; |
mtklein
2013/09/12 20:07:36
Can you tack on something like
// The '!' in th
kevin.petit.not.used.account
2013/09/13 12:08:27
Done.
| |
1108 unsigned dither = DITHER_VALUE(myx); | 1107 unsigned dither = DITHER_VALUE(myx); |
1109 uint16_t val = SkDitherRGB32To565(c, dither); | 1108 uint16_t val = SkDitherRGB32To565(c, dither); |
1110 if (val != dst[i]) { | 1109 if (val != dst[i]) { |
1111 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n", | 1110 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n", |
1112 c, dither, val, dst[i], dstart[i]); | 1111 c, dither, val, dst[i], dstart[i]); |
1113 } | 1112 } |
1114 DITHER_INC_X(myx); | 1113 DITHER_INC_X(myx); |
1115 } | 1114 } |
1116 } | 1115 } |
1117 #endif | 1116 #endif |
1118 | 1117 |
1119 dst += UNROLL; | 1118 dst += UNROLL; |
1120 src += UNROLL; | |
mtklein
2013/09/12 20:07:36
Maybe even duplicate the same note here about why
kevin.petit.not.used.account
2013/09/13 12:08:27
Done.
| |
1121 count -= UNROLL; | 1119 count -= UNROLL; |
1122 x += UNROLL; /* probably superfluous */ | 1120 x += UNROLL; /* probably superfluous */ |
1123 } | 1121 } |
1124 } | 1122 } |
1125 #undef UNROLL | 1123 #undef UNROLL |
1126 | 1124 |
1127 /* residuals */ | 1125 /* residuals */ |
1128 if (count > 0) { | 1126 if (count > 0) { |
1129 DITHER_565_SCAN(y); | 1127 DITHER_565_SCAN(y); |
1130 do { | 1128 do { |
(...skipping 137 matching lines...) | |
1268 * case where we do not inspect the src alpha. | 1266 * case where we do not inspect the src alpha. |
1269 */ | 1267 */ |
1270 #if SK_A32_SHIFT == 24 | 1268 #if SK_A32_SHIFT == 24 |
1271 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1269 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1270 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1273 #else | 1271 #else |
1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1272 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1275 #endif | 1273 #endif |
1276 S32A_Blend_BlitRow32_arm // S32A_Blend | 1274 S32A_Blend_BlitRow32_arm // S32A_Blend |
1277 }; | 1275 }; |
OLD | NEW |