src/codec/SkPngFilters.cpp - Issue 1699953002: Make png filter functions compatible with libpng

Side by Side Diff: src/codec/SkPngFilters.cpp

Issue 1699953002: Make png filter functions compatible with libpng (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Add loadX() and storeX() functions Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkPngFilters.h"	8 #include "SkPngFilters.h"

9	9

10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d ).	10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d ).

11 // They're positioned like this:	11 // They're positioned like this:

12 // prev: c b	12 // prev: c b

13 // row: a d	13 // row: a d

14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which ever	14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which ever

15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.)	15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.)

16	16

17 #if defined(__SSE2__)	17 #if defined(__SSE2__)

18	18

19 template <int bpp>	19 static __m128i load3(const void* p) {

20 static __m128i load(const void* p) {	20 uint32_t packed;

21 static_assert(bpp <= 4, "");	21 memcpy(&packed, p, 3);

22	22 return _mm_cvtsi32_si128(packed);

23 uint32_t packed;	23 }

24 memcpy(&packed, p, bpp);	24

25 return _mm_cvtsi32_si128(packed);	25 static __m128i load4(const void* p) {

26 }	26 uint32_t packed;

27	27 memcpy(&packed, p, 4);

28 template <int bpp>	28 return _mm_cvtsi32_si128(packed);
	mtklein 2016/02/16 14:11:57 Now that we've split these apart, we might consider writing this more simply: return _mm_cvtsi32_si128(*(const int*)p); and same for store4: *(int*)p = _mm_cvtsi128_si32(v);
29 static void store(void* p, __m128i v) {	29 }

30 static_assert(bpp <= 4, "");	30

31	31 static void store3(void* p, __m128i v) {

32 uint32_t packed = _mm_cvtsi128_si32(v);	32 uint32_t packed = _mm_cvtsi128_si32(v);

33 memcpy(p, &packed, bpp);	33 memcpy(p, &packed, 3);

34 }	34 }

35	35

36 template <int bpp>	36 static void store4(void* p, __m128i v) {

37 static void sk_sub_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* ) {	37 uint32_t packed = _mm_cvtsi128_si32(v);

38 // The Sub filter predicts each pixel as the previous pixel, a.	38 memcpy(p, &packed, 4);

39 // There is no pixel to the left of the first pixel. It's encoded direc tly.	39 }

40 // That works with our main loop if we just say that left pixel was zero .	40

41 __m128i a, d = _mm_setzero_si128();	41 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row,
	mtklein 2016/02/16 14:11:57 Why do these guys go to two lines? Wouldn't it only be 78 columns to write void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { ?
	msarett 2016/02/16 14:48:59 Not sure. I copied the style of the signatures from libpng's NEON opts. Note that libpng uses png_bytep so it's different. Irrelevant since I'm returning this to Skia style.
42	42 const uint8_t* prev)

43 int rb = row_info->rowbytes;	43 {

44 while (rb > 0) {	44 // The Sub filter predicts each pixel as the previous pixel, a.

45 a = d; d = load<bpp>(row);	45 // There is no pixel to the left of the first pixel. It's encoded directly.

46 d = _mm_add_epi8(d, a);	46 // That works with our main loop if we just say that left pixel was zero.

47 store<bpp>(row, d);	47 __m128i a, d = _mm_setzero_si128();

48	48

49 row += bpp;	49 int rb = row_info->rowbytes;

50 rb -= bpp;	50 while (rb > 0) {

51 }	51 a = d; d = load3(row);

52 }	52 d = _mm_add_epi8(d, a);

53	53 store3(row, d);

54 template <int bpp>	54

55 void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	55 row += 3;

56 // The Avg filter predicts each pixel as the (truncated) average of a an d b.	56 rb -= 3;

57 // There's no pixel to the left of the first pixel. Luckily, it's	57 }

58 // predicted to be half of the pixel above it. So again, this works	58 }

59 // perfectly with our loop if we make sure a starts at zero.	59

60 const __m128i zero = _mm_setzero_si128();	60 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row,

61 __m128i b;	61 const uint8_t* prev)

62 __m128i a, d = zero;	62 {

63	63 // The Sub filter predicts each pixel as the previous pixel, a.

64 int rb = row_info->rowbytes;	64 // There is no pixel to the left of the first pixel. It's encoded directly.

65 while (rb > 0) {	65 // That works with our main loop if we just say that left pixel was zero.

66 b = load<bpp>(prev);	66 __m128i a, d = _mm_setzero_si128();

67 a = d; d = load<bpp>(row );	67

68	68 int rb = row_info->rowbytes;

69 // PNG requires a truncating average here, so sadly we can't just us e _mm_avg_epu8...	69 while (rb > 0) {

70 __m128i avg = _mm_avg_epu8(a,b);	70 a = d; d = load4(row);

71 // ...but we can fix it up by subtracting off 1 if it rounded up.	71 d = _mm_add_epi8(d, a);

72 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e pi8(1)));	72 store4(row, d);

73	73

74 d = _mm_add_epi8(d, avg);	74 row += 4;

75 store<bpp>(row, d);	75 rb -= 4;

76	76 }

77 prev += bpp;	77 }

78 row += bpp;	78

79 rb -= bpp;	79 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row,

80 }	80 const uint8_t* prev)

81 }	81 {

82	82 // The Avg filter predicts each pixel as the (truncated) average of a and b.

83 // Returns |x| for 16-bit lanes.	83 // There's no pixel to the left of the first pixel. Luckily, it's

84 static __m128i abs_i16(__m128i x) {	84 // predicted to be half of the pixel above it. So again, this works

85 #if defined(__SSSE3__)	85 // perfectly with our loop if we make sure a starts at zero.

86 return _mm_abs_epi16(x);	86 const __m128i zero = _mm_setzero_si128();

87 #else	87 __m128i b;
	mtklein 2016/02/16 14:11:57 We might want to scoot b over a few columns right to keep the
	    c b
	    a d
	layout. Same deal below with the load:
	           b = load3(prev);
	    a = d; d = load3(row);
	msarett 2016/02/16 14:48:59 Done.
88 // Read this all as, return x<0 ? -x : x.	88 __m128i a, d = zero;

89 // To negate two's complement, you flip all the bits then add 1.	89

90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());	90 int rb = row_info->rowbytes;

91 x = _mm_xor_si128(x, is_negative); // Flip negative lanes.	91 while (rb > 0) {

92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negativ e lanes, else +0.	92 b = load3(prev);

93 return x;	93 a = d; d = load3(row);

94 #endif	94

95 }	95 // PNG requires a truncating average here, so sadly we can't just use

96	96 // _mm_avg_epu8...

97 // Bytewise c ? t : e.	97 __m128i avg = _mm_avg_epu8(a,b);

98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {	98 // ...but we can fix it up by subtracting off 1 if it rounded up.

99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling.	99 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

100 return _mm_blendv_epi8(e,t,c);	100 _mm_set1_epi8(1)));

101 #else	101

102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));	102 d = _mm_add_epi8(d, avg);

103 #endif	103 store3(row, d);

104 }	104

105	105 prev += 3;

106 template <int bpp>	106 row += 3;

107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev ) {	107 rb -= 3;

108 // Paeth tries to predict pixel d using the pixel to the left of it, a,	108 }

109 // and two pixels from the previous row, b and c:	109 }

110 // prev: c b	110 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row,

111 // row: a d	111 const uint8_t* prev)

112 // The Paeth function predicts d to be whichever of a, b, or c is neares t to p=a+b-c.	112 {

113	113 // The Avg filter predicts each pixel as the (truncated) average of a and b.

114 // The first pixel has no left context, and so uses an Up filter, p = b.	114 // There's no pixel to the left of the first pixel. Luckily, it's

115 // This works naturally with our main loop's p = a+b-c if we force a and c to zero.	115 // predicted to be half of the pixel above it. So again, this works

116 // Here we zero b and d, which become c and a respectively at the start of the loop.	116 // perfectly with our loop if we make sure a starts at zero.

117 const __m128i zero = _mm_setzero_si128();	117 const __m128i zero = _mm_setzero_si128();

118 __m128i c, b = zero,	118 __m128i b;

119 a, d = zero;	119 __m128i a, d = zero;

120	120

121 int rb = row_info->rowbytes;	121 int rb = row_info->rowbytes;

122 while (rb > 0) {	122 while (rb > 0) {

123 // It's easiest to do this math (particularly, deal with pc) with 16 -bit intermediates.	123 b = load4(prev);

124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero);	124 a = d; d = load4(row);

125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero);	125

126	126 // PNG requires a truncating average here, so sadly we can't just use

127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c)	127 // _mm_avg_epu8...

128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c)	128 __m128i avg = _mm_avg_epu8(a,b);

129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c -c) == (b-c)+(a-c)	129 // ...but we can fix it up by subtracting off 1 if it rounded up.

130	130 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

131 pa = abs_i16(pa); // |p-a|	131 _mm_set1_epi8(1)));

132 pb = abs_i16(pb); // |p-b|	132

133 pc = abs_i16(pc); // |p-c|	133 d = _mm_add_epi8(d, avg);

134	134 store4(row, d);

135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));	135

136	136 prev += 4;

137 // Paeth breaks ties favoring a over b over c.	137 row += 4;

138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,	138 rb -= 4;

139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,	139 }

140 c));	140 }

141	141

142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to wrap modulo 256.	142 // Returns |x| for 16-bit lanes.

143 store<bpp>(row, _mm_packus_epi16(d,d));	143 static __m128i abs_i16(__m128i x) {

144	144 #if defined(__SSSE3__)

145 prev += bpp;	145 return _mm_abs_epi16(x);

146 row += bpp;	146 #else

147 rb -= bpp;	147 // Read this all as, return x<0 ? -x : x.

148 }	148 // To negate two's complement, you flip all the bits then add 1.

149 }	149 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());

150	150 // Flip negative lanes.

151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	151 x = _mm_xor_si128(x, is_negative);

152 sk_sub_sse2<3>(row_info, row, prev);	152 // +1 to negative lanes, else +0.

153 }	153 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));

154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	154 return x;

155 sk_sub_sse2<4>(row_info, row, prev);

156 }

157

158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

159 sk_avg_sse2<3>(row_info, row, prev);

160 }

161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

162 sk_avg_sse2<4>(row_info, row, prev);

163 }

164

165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) {

166 sk_paeth_sse2<3>(row_info, row, prev);

167 }

168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) {

169 sk_paeth_sse2<4>(row_info, row, prev);

170 }

171

172 #endif	155 #endif

	156 }

	157

	158 // Bytewise c ? t : e.

	159 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {

	160 #if defined(__SSE4_1__)

	161 return _mm_blendv_epi8(e,t,c);

	162 #else

	163 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));

	164 #endif

	165 }

	166

	167 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row,

	168 const uint8_t* prev)

	169 {

	170 // Paeth tries to predict pixel d using the pixel to the left of it, a,

	171 // and two pixels from the previous row, b and c:

	172 // prev: c b

	173 // row: a d

	174 // The Paeth function predicts d to be whichever of a, b, or c is nearest to

	175 // p=a+b-c. The first pixel has no left context, and so uses an Up filter,

	176 // p = b. This works naturally with our main loop's p = a+b-c if we force a

	177 // and c to zero. Here we zero b and d, which become c and a respectively

	178 // at the start of the loop.

	179 const __m128i zero = _mm_setzero_si128();

	180 __m128i c, b = zero,

	181 a, d = zero;

	182

	183 int rb = row_info->rowbytes;

	184 while (rb > 0) {

	185 // It's easiest to do this math (particularly, deal with pc) with 16-bit

	186 // intermediates.

	187 b = load3(prev);

	188 d = load3(row);
	mtklein 2016/02/16 14:11:57 I think this breaks things by loading new values for b and d before we rotate the old values into c and a.
	msarett 2016/02/16 14:48:59 Done.
	189 c = b; b = _mm_unpacklo_epi8(b, zero);

	190 a = d; d = _mm_unpacklo_epi8(d, zero);

	191 __m128i pa = _mm_sub_epi16(b,c),

	192 // (p-a) == (a+b-c - a) == (b-c)
	mtklein 2016/02/16 14:11:57 Moving these comments around and changing the alignment here has made things harder to read. If there's no way to have everything under formatting constraints, let's sacrifice the comments, or move them out of line.
	msarett 2016/02/16 14:48:59 Done.
	193 pb = _mm_sub_epi16(a,c),

	194 // (p-b) == (a+b-c - b) == (a-c)

	195 pc = _mm_add_epi16(pa,pb);

	196 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

	197

	198 pa = abs_i16(pa);// |p-a|

	199 pb = abs_i16(pb);// |p-b|

	200 pc = abs_i16(pc);// |p-c|

	201

	202 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	203

	204 // Paeth breaks ties favoring a over b over c.

	205 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	206 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
	mtklein 2016/02/16 14:11:57 I think this hurts readability to indent like this. It's lost the alignment that made it clear.
	msarett 2016/02/16 14:48:58 Done.
	207 c));

	208

	209 // Note `_epi8`: we need addition to wrap modulo 256.

	210 d = _mm_add_epi8(d, nearest);

	211 store3(row, _mm_packus_epi16(d,d));

	212 prev += 3;

	213 row += 3;

	214 rb -= 3;

	215 }

	216 }

	217

	218 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row,

	219 const uint8_t* prev)

	220 {

	221 // Paeth tries to predict pixel d using the pixel to the left of it, a,

	222 // and two pixels from the previous row, b and c:

	223 // prev: c b

	224 // row: a d

	225 // The Paeth function predicts d to be whichever of a, b, or c is nearest to

	226 // p=a+b-c. The first pixel has no left context, and so uses an Up filter,

	227 // p = b. This works naturally with our main loop's p = a+b-c if we force a

	228 // and c to zero. Here we zero b and d, which become c and a respectively

	229 // at the start of the loop.

	230 const __m128i zero = _mm_setzero_si128();

	231 __m128i c, b = zero,

	232 a, d = zero;

	233

	234 int rb = row_info->rowbytes;

	235 while (rb > 0) {

	236 // It's easiest to do this math (particularly, deal with pc) with 16-bit

	237 // intermediates.

	238 b = load4(prev);

	239 d = load4(row);

	240 c = b; b = _mm_unpacklo_epi8(b, zero);

	241 a = d; d = _mm_unpacklo_epi8(d, zero);

	242 __m128i pa = _mm_sub_epi16(b,c),

	243 // (p-a) == (a+b-c - a) == (b-c)

	244 pb = _mm_sub_epi16(a,c),

	245 // (p-b) == (a+b-c - b) == (a-c)

	246 pc = _mm_add_epi16(pa,pb);

	247 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

	248

	249 pa = abs_i16(pa);// |p-a|

	250 pb = abs_i16(pb);// |p-b|

	251 pc = abs_i16(pc);// |p-c|

	252

	253 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	254

	255 // Paeth breaks ties favoring a over b over c.

	256 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	257 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

	258 c));

	259

	260 // Note `_epi8`: we need addition to wrap modulo 256.

	261 d = _mm_add_epi8(d, nearest);

	262 store4(row, _mm_packus_epi16(d,d));

	263 prev += 4;

	264 row += 4;

	265 rb -= 4;

	266 }

	267 }

	268

	269 #endif

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »