src/codec/SkPngFilters.cpp - Issue 1657503002: Look beyond SSE2 for Paeth

Side by Side Diff: src/codec/SkPngFilters.cpp

Issue 1657503002: Look beyond SSE2 for Paeth (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: kill sse4.1 Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkPngFilters.h"	8 #include "SkPngFilters.h"

9	9

10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d).	10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d).

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
73	73

74 d = _mm_add_epi8(d, avg);	74 d = _mm_add_epi8(d, avg);

75 store<bpp>(row, d);	75 store<bpp>(row, d);

76	76

77 prev += bpp;	77 prev += bpp;

78 row += bpp;	78 row += bpp;

79 rb -= bpp;	79 rb -= bpp;

80 }	80 }

81 }	81 }

82	82

83 // Returns bytewise \|x-y\|.	83 // Returns \|x\| for 16-bit lanes.

84 static __m128i absdiff_u8(__m128i x, __m128i y) {	84 static __m128i abs_i16(__m128i x) {

85 // One of these two saturated subtractions will be the answer, the other zero.	85 #if defined(__SSSE3__)

86 return _mm_or_si128(_mm_subs_epu8(x,y), _mm_subs_epu8(y,x));	86 return _mm_abs_epi16(x);

	87 #else

	88 // Read this all as, return x<0 ? -x : x.

	89 // To negate two's complement, you flip all the bits then add 1.

	90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());

	91 x = _mm_xor_si128(x, is_negative); // Flip negative lanes.

	92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative lanes, else +0.

	93 return x;

	94 #endif

87 }	95 }

88	96

89 // Bytewise c ? t : e.	97 // Bytewise c ? t : e.

90 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {	98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {

91 // SSE 4.1+ would be: return _mm_blendv_epi8(e,t,c);	99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling.

	100 return _mm_blendv_epi8(e,t,c);

	101 #else

92 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));	102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));

	103 #endif

93 }	104 }

94	105

95 template <int bpp>	106 template <int bpp>

96 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev ) {	107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev ) {

97 // Paeth tries to predict pixel d using the pixel to the left of it, a,	108 // Paeth tries to predict pixel d using the pixel to the left of it, a,

98 // and two pixels from the previous row, b and c:	109 // and two pixels from the previous row, b and c:

99 // prev: c b	110 // prev: c b

100 // row: a d	111 // row: a d

101 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c.	112 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c.

102	113

103 // The first pixel has no left context, and so uses an Up filter, p = b.	114 // The first pixel has no left context, and so uses an Up filter, p = b.

104 // This works naturally with our main loop's p = a+b-c if we force a and c to zero.	115 // This works naturally with our main loop's p = a+b-c if we force a and c to zero.

105 // Here we zero b and d, which become c and a respectively at the start of the loop.	116 // Here we zero b and d, which become c and a respectively at the start of the loop.

106 __m128i c, b = _mm_setzero_si128(),	117 const __m128i zero = _mm_setzero_si128();

107 a, d = _mm_setzero_si128();	118 __m128i c, b = zero,

	119 a, d = zero;

108	120

109 int rb = row_info->rowbytes;	121 int rb = row_info->rowbytes;

110 while (rb > 0) {	122 while (rb > 0) {

111 c = b; b = load<bpp>(prev);	123 // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates.

112 a = d; d = load<bpp>(row );	124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero);

	125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero);

113	126

114 // We can't express p in 8 bits, but luckily we can use this faux p instead.	127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c)

115 // (I have no deep insight here... I just proved this with brute force.)	128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c)

116 __m128i min = _mm_min_epu8(a,b),	129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c -c) == (b-c)+(a-c)

117 max = _mm_max_epu8(a,b),

118 faux_p = _mm_adds_epu8(min, _mm_subs_epu8(max, c));

119	130

120 // We could use faux_p for calculating all three of pa, pb, and pc,	131 pa = abs_i16(pa); // \|p-a\|

121 // but it's a little quicker to calculate the correct pa and pb directly,	132 pb = abs_i16(pb); // \|p-b\|

122 // and the predictor remains the same. (Again, brute force.)	133 pc = abs_i16(pc); // \|p-c\|

123 __m128i pa = absdiff_u8(b,c), // \|a+b-c - a\| == \|b-c\|

124 pb = absdiff_u8(a,c), // \|a+b-c - b\| == \|a-c\|

125 faux_pc = absdiff_u8(faux_p, c);

126	134

127 // From here, things are straightforward. Find the smallest distance to p...	135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

128 __m128i smallest = _mm_min_epu8(_mm_min_epu8(pa, pb), faux_pc);

129	136

130 // ... then the predictor is the input corresponding to that smallest distance,	137 // Paeth breaks ties favoring a over b over c.

131 // breaking ties in favor of a over b over c.	138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

132 __m128i nearest = if_then_else(_mm_cmpeq_epi8(smallest, pa), a,	139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

133 if_then_else(_mm_cmpeq_epi8(smallest, pb), b,	140 c));

134 c));

135	141

136 // We've reconstructed d! Leave it for next round to become a, and write it out.	142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to wrap modulo 256.

137 d = _mm_add_epi8(d, nearest);	143 store<bpp>(row, _mm_packus_epi16(d,d));

138 store<bpp>(row, d);

139	144

140 prev += bpp;	145 prev += bpp;

141 row += bpp;	146 row += bpp;

142 rb -= bpp;	147 rb -= bpp;

143 }	148 }

144 }	149 }

145	150

146 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

147 sk_sub_sse2<3>(row_info, row, prev);	152 sk_sub_sse2<3>(row_info, row, prev);

148 }	153 }

149 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

150 sk_sub_sse2<4>(row_info, row, prev);	155 sk_sub_sse2<4>(row_info, row, prev);

151 }	156 }

152	157

153 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

154 sk_avg_sse2<3>(row_info, row, prev);	159 sk_avg_sse2<3>(row_info, row, prev);

155 }	160 }

156 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

157 sk_avg_sse2<4>(row_info, row, prev);	162 sk_avg_sse2<4>(row_info, row, prev);

158 }	163 }

159	164

160 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) {	165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) {

161 sk_paeth_sse2<3>(row_info, row, prev);	166 sk_paeth_sse2<3>(row_info, row, prev);

162 }	167 }

163 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) {	168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) {

164 sk_paeth_sse2<4>(row_info, row, prev);	169 sk_paeth_sse2<4>(row_info, row, prev);

165 }	170 }

166	171

167 #endif	172 #endif

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »