src/codec/SkPngFilters.cpp - Issue 1699953002: Make png filter functions compatible with libpng

Side by Side Diff: src/codec/SkPngFilters.cpp

Issue 1699953002: Make png filter functions compatible with libpng (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkPngFilters.h"	8 #include "SkPngFilters.h"

9	9

10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d).	10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d).

11 // They're positioned like this:	11 // They're positioned like this:

12 // prev: c b	12 // prev: c b

13 // row: a d	13 // row: a d

14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which ever	14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which ever

15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.)	15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.)

16	16

17 #if defined(__SSE2__)	17 #if defined(__SSE2__)

18	18

19 template <int bpp>	19 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row,

20 static __m128i load(const void* p) {	20 const uint8_t* prev)

21 static_assert(bpp <= 4, "");	21 {

22	22 // The Sub filter predicts each pixel as the previous pixel, a.

23 uint32_t packed;	23 // There is no pixel to the left of the first pixel. It's encoded directly.

24 memcpy(&packed, p, bpp);	24 // That works with our main loop if we just say that left pixel was zero.

25 return _mm_cvtsi32_si128(packed);	25 __m128i a, d = _mm_setzero_si128();

26 }	26

27	27 int rb = row_info->rowbytes;

28 template <int bpp>	28 while (rb > 0) {

29 static void store(void* p, __m128i v) {	29 a = d; memcpy(&d, row, 3);
	mtklein 2016/02/15 20:16:32 Seems worth keeping load() and store() for readability, even if they're all implemented with memcpy: static __m128i load3(const void*) { ... } static __m128i load4(const void*) { ... } static void store3(void*, __m128i) { ... } static void store4(void*, __m128i) { ... }
	msarett 2016/02/16 13:42:35 On 2016/02/15 20:16:32, mtklein wrote: > Seems worth keeping load() and store() for readability, even if they're all > implemented with memcpy: > > static __m128i load3(const void*) { ... } > static __m128i load4(const void*) { ... } > > static void store3(void*, __m128i) { ... } > static void store4(void*, __m128i) { ... } Done.
30 static_assert(bpp <= 4, "");	30 d = _mm_add_epi8(d, a);

31	31 memcpy(row, &d, 3);

32 uint32_t packed = _mm_cvtsi128_si32(v);	32

33 memcpy(p, &packed, bpp);	33 row += 3;

34 }	34 rb -= 3;

35	35 }

36 template <int bpp>	36 }

37 static void sk_sub_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* ) {	37

38 // The Sub filter predicts each pixel as the previous pixel, a.	38 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row,

39 // There is no pixel to the left of the first pixel. It's encoded directly.	39 const uint8_t* prev)

40 // That works with our main loop if we just say that left pixel was zero.	40 {

41 __m128i a, d = _mm_setzero_si128();	41 // The Sub filter predicts each pixel as the previous pixel, a.

42	42 // There is no pixel to the left of the first pixel. It's encoded directly.

43 int rb = row_info->rowbytes;	43 // That works with our main loop if we just say that left pixel was zero.

44 while (rb > 0) {	44 __m128i a, d = _mm_setzero_si128();

45 a = d; d = load<bpp>(row);	45

46 d = _mm_add_epi8(d, a);	46 int rb = row_info->rowbytes;

47 store<bpp>(row, d);	47 while (rb > 0) {

48	48 a = d; memcpy(&d, row, 4);

49 row += bpp;	49 d = _mm_add_epi8(d, a);

50 rb -= bpp;	50 memcpy(row, &d, 4);

51 }	51

52 }	52 row += 4;

53	53 rb -= 4;

54 template <int bpp>	54 }

55 void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {	55 }

56 // The Avg filter predicts each pixel as the (truncated) average of a and b.	56

57 // There's no pixel to the left of the first pixel. Luckily, it's	57 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row,

58 // predicted to be half of the pixel above it. So again, this works	58 const uint8_t* prev)

59 // perfectly with our loop if we make sure a starts at zero.	59 {

60 const __m128i zero = _mm_setzero_si128();	60 // The Avg filter predicts each pixel as the (truncated) average of a and b.

61 __m128i b;	61 // There's no pixel to the left of the first pixel. Luckily, it's

62 __m128i a, d = zero;	62 // predicted to be half of the pixel above it. So again, this works

63	63 // perfectly with our loop if we make sure a starts at zero.

64 int rb = row_info->rowbytes;	64 const __m128i zero = _mm_setzero_si128();

65 while (rb > 0) {	65 __m128i b;

66 b = load<bpp>(prev);	66 __m128i a, d = zero;

67 a = d; d = load<bpp>(row );	67

68	68 int rb = row_info->rowbytes;

69 // PNG requires a truncating average here, so sadly we can't just use _mm_avg_epu8...	69 while (rb > 0) {

70 __m128i avg = _mm_avg_epu8(a,b);	70 memcpy(&b, prev, 3);

71 // ...but we can fix it up by subtracting off 1 if it rounded up.	71 a = d; memcpy(&d, row, 3);

72 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1)));	72

73	73 // PNG requires a truncating average here, so sadly we can't just use

74 d = _mm_add_epi8(d, avg);	74 // _mm_avg_epu8...

75 store<bpp>(row, d);	75 __m128i avg = _mm_avg_epu8(a,b);

76	76 // ...but we can fix it up by subtracting off 1 if it rounded up.

77 prev += bpp;	77 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

78 row += bpp;	78 _mm_set1_epi8(1)));

79 rb -= bpp;	79

80 }	80 d = _mm_add_epi8(d, avg);

81 }	81 memcpy(row, &d, 3);

82	82

83 // Returns \|x\| for 16-bit lanes.	83 prev += 3;

84 static __m128i abs_i16(__m128i x) {	84 row += 3;

85 #if defined(__SSSE3__)	85 rb -= 3;

86 return _mm_abs_epi16(x);	86 }

87 #else	87 }

88 // Read this all as, return x<0 ? -x : x.	88 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row,

89 // To negate two's complement, you flip all the bits then add 1.	89 const uint8_t* prev)

90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());	90 {

91 x = _mm_xor_si128(x, is_negative); // Flip negative lanes.	91 // The Avg filter predicts each pixel as the (truncated) average of a and b.

92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative lanes, else +0.	92 // There's no pixel to the left of the first pixel. Luckily, it's

93 return x;	93 // predicted to be half of the pixel above it. So again, this works

94 #endif	94 // perfectly with our loop if we make sure a starts at zero.

95 }	95 const __m128i zero = _mm_setzero_si128();

96	96 __m128i b;

97 // Bytewise c ? t : e.	97 __m128i a, d = zero;

98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {	98

99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling.	99 int rb = row_info->rowbytes;

100 return _mm_blendv_epi8(e,t,c);	100 while (rb > 0) {

101 #else	101 memcpy(&b, prev, 4);

102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));	102 a = d; memcpy(&d, row, 4);

103 #endif	103

104 }	104 // PNG requires a truncating average here, so sadly we can't just use

105	105 // _mm_avg_epu8...

106 template <int bpp>	106 __m128i avg = _mm_avg_epu8(a,b);

107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev ) {	107 // ...but we can fix it up by subtracting off 1 if it rounded up.

108 // Paeth tries to predict pixel d using the pixel to the left of it, a,	108 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

109 // and two pixels from the previous row, b and c:	109 _mm_set1_epi8(1)));

110 // prev: c b	110

111 // row: a d	111 d = _mm_add_epi8(d, avg);

112 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c.	112 memcpy(row, &d, 4);

113	113

114 // The first pixel has no left context, and so uses an Up filter, p = b.	114 prev += 4;

115 // This works naturally with our main loop's p = a+b-c if we force a and c to zero.	115 row += 4;

116 // Here we zero b and d, which become c and a respectively at the start of the loop.	116 rb -= 4;

117 const __m128i zero = _mm_setzero_si128();	117 }

118 __m128i c, b = zero,	118 }

119 a, d = zero;	119

120	120 // Returns \|x\| for 16-bit lanes.

121 int rb = row_info->rowbytes;	121 static __m128i abs_i16(__m128i x) {

122 while (rb > 0) {	122 #if defined(__SSSE3__)

123 // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates.	123 return _mm_abs_epi16(x);

124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero);	124 #else

125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero);	125 // Read this all as, return x<0 ? -x : x.

126	126 // To negate two's complement, you flip all the bits then add 1.

127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c)	127 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());

128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c)	128 // Flip negative lanes.

129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c -c) == (b-c)+(a-c)	129 x = _mm_xor_si128(x, is_negative);

130	130 // +1 to negative lanes, else +0.

131 pa = abs_i16(pa); // \|p-a\|	131 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));

132 pb = abs_i16(pb); // \|p-b\|	132 return x;

133 pc = abs_i16(pc); // \|p-c\|

134

135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

136

137 // Paeth breaks ties favoring a over b over c.

138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

140 c));

141

142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to wrap modulo 256.

143 store<bpp>(row, _mm_packus_epi16(d,d));

144

145 prev += bpp;

146 row += bpp;

147 rb -= bpp;

148 }

149 }

150

151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

152 sk_sub_sse2<3>(row_info, row, prev);

153 }

154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

155 sk_sub_sse2<4>(row_info, row, prev);

156 }

157

158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

159 sk_avg_sse2<3>(row_info, row, prev);

160 }

161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

162 sk_avg_sse2<4>(row_info, row, prev);

163 }

164

165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

166 sk_paeth_sse2<3>(row_info, row, prev);

167 }

168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {

169 sk_paeth_sse2<4>(row_info, row, prev);

170 }

171

172 #endif	133 #endif

	134 }

	135

	136 // Bytewise c ? t : e.

	137 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {

	138 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling.
	msarett 2016/02/15 19:50:20 I dropped the SSE4 code for libpng patch.
	mtklein 2016/02/15 20:27:33 On 2016/02/15 19:50:20, msarett wrote: > I dropped the SSE4 code for libpng patch. Let's set up a bot to test this tomorrow. It'd actually be somewhat of a shame to drop this for Android, given that they will be building with -march=native, and new mobile x86 chips do support SSE 4.1.
	msarett 2016/02/16 13:42:35 On 2016/02/15 20:27:33, mtklein wrote: > On 2016/02/15 19:50:20, msarett wrote: > > I dropped the SSE4 code for libpng patch. > > Let's set up a bot to test this tomorrow. > > It'd actually be somewhat of a shame to drop this for Android, given that they > will be building with -march=native, and new mobile x86 chips do support SSE > 4.1. Dropping the #if 0 in order to actually run this code. Let's land it this way, if we are confident that we can get a bot set-up today.
	139 return _mm_blendv_epi8(e,t,c);

	140 #else

	141 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));

	142 #endif

	143 }

	144

	145 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row,

	146 const uint8_t* prev)

	147 {

	148 // Paeth tries to predict pixel d using the pixel to the left of it, a,

	149 // and two pixels from the previous row, b and c:

	150 // prev: c b

	151 // row: a d

	152 // The Paeth function predicts d to be whichever of a, b, or c is nearest to

	153 // p=a+b-c. The first pixel has no left context, and so uses an Up filter,

	154 // p = b. This works naturally with our main loop's p = a+b-c if we force a

	155 // and c to zero. Here we zero b and d, which become c and a respectively

	156 // at the start of the loop.

	157 const __m128i zero = _mm_setzero_si128();

	158 __m128i c, b = zero,

	159 a, d = zero;

	160

	161 int rb = row_info->rowbytes;

	162 while (rb > 0) {

	163 // It's easiest to do this math (particularly, deal with pc) with 16-bit

	164 // intermediates.

	165 memcpy(&b, prev, 3);

	166 memcpy(&d, row, 3);

	167 c = b; b = _mm_unpacklo_epi8(b, zero);

	168 a = d; d = _mm_unpacklo_epi8(d, zero);

	169 __m128i pa = _mm_sub_epi16(b,c),

	170 // (p-a) == (a+b-c - a) == (b-c)

	171 pb = _mm_sub_epi16(a,c),

	172 // (p-b) == (a+b-c - b) == (a-c)

	173 pc = _mm_add_epi16(pa,pb);

	174 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

	175

	176 pa = abs_i16(pa);// \|p-a\|

	177 pb = abs_i16(pb);// \|p-b\|

	178 pc = abs_i16(pc);// \|p-c\|

	179

	180 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	181

	182 // Paeth breaks ties favoring a over b over c.

	183 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	184 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

	185 c));

	186

	187 // Note `_epi8`: we need addition to wrap modulo 256.

	188 d = _mm_add_epi8(d, nearest);

	189 __m128i r = _mm_packus_epi16(d,d);

	190 memcpy(row, &r, 3);

	191 prev += 3;

	192 row += 3;

	193 rb -= 3;

	194 }

	195 }

	196

	197 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row,

	198 const uint8_t* prev)

	199 {

	200 // Paeth tries to predict pixel d using the pixel to the left of it, a,

	201 // and two pixels from the previous row, b and c:

	202 // prev: c b

	203 // row: a d

	204 // The Paeth function predicts d to be whichever of a, b, or c is nearest to

	205 // p=a+b-c. The first pixel has no left context, and so uses an Up filter,

	206 // p = b. This works naturally with our main loop's p = a+b-c if we force a

	207 // and c to zero. Here we zero b and d, which become c and a respectively

	208 // at the start of the loop.

	209 const __m128i zero = _mm_setzero_si128();

	210 __m128i c, b = zero,

	211 a, d = zero;

	212

	213 int rb = row_info->rowbytes;

	214 while (rb > 0) {

	215 // It's easiest to do this math (particularly, deal with pc) with 16-bit

	216 // intermediates.

	217 memcpy(&b, prev, 4);

	218 memcpy(&d, row, 4);

	219 c = b; b = _mm_unpacklo_epi8(b, zero);

	220 a = d; d = _mm_unpacklo_epi8(d, zero);

	221 __m128i pa = _mm_sub_epi16(b,c),

	222 // (p-a) == (a+b-c - a) == (b-c)

	223 pb = _mm_sub_epi16(a,c),

	224 // (p-b) == (a+b-c - b) == (a-c)

	225 pc = _mm_add_epi16(pa,pb);

	226 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

	227

	228 pa = abs_i16(pa);// \|p-a\|

	229 pb = abs_i16(pb);// \|p-b\|

	230 pc = abs_i16(pc);// \|p-c\|

	231

	232 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	233

	234 // Paeth breaks ties favoring a over b over c.

	235 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	236 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

	237 c));

	238

	239 // Note `_epi8`: we need addition to wrap modulo 256.

	240 d = _mm_add_epi8(d, nearest);

	241 __m128i r = _mm_packus_epi16(d,d);

	242 memcpy(row, &r, 4);

	243 prev += 4;

	244 row += 4;

	245 rb -= 4;

	246 }

	247 }

	248

	249 #endif

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »