OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkPngFilters.h" | 8 #include "SkPngFilters.h" |
9 | 9 |
10 // Functions in this file look at at most 3 pixels (a,b,c) to predict the fourth (d). | 10 // Functions in this file look at at most 3 pixels (a,b,c) to predict the fourth (d). |
11 // They're positioned like this: | 11 // They're positioned like this: |
12 // prev: c b | 12 // prev: c b |
13 // row: a d | 13 // row: a d |
14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be whichever | 14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be whichever |
15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) | 15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) |
16 | 16 |
17 #if defined(__SSE2__) | 17 #if defined(__SSE2__) |
18 | 18 |
19 template <int bpp> | 19 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, |
20 static __m128i load(const void* p) { | 20 const uint8_t* prev) |
21 static_assert(bpp <= 4, ""); | 21 { |
22 | 22 // The Sub filter predicts each pixel as the previous pixel, a. |
23 uint32_t packed; | 23 // There is no pixel to the left of the first pixel. It's encoded directly. |
24 memcpy(&packed, p, bpp); | 24 // That works with our main loop if we just say that left pixel was zero. |
25 return _mm_cvtsi32_si128(packed); | 25 __m128i a, d = _mm_setzero_si128(); |
26 } | 26 |
27 | 27 int rb = row_info->rowbytes; |
28 template <int bpp> | 28 while (rb > 0) { |
29 static void store(void* p, __m128i v) { | 29 a = d; memcpy(&d, row, 3); |
mtklein
2016/02/15 20:16:32
Seems worth keeping load() and store() for readabi
msarett
2016/02/16 13:42:35
Done.
| |
30 static_assert(bpp <= 4, ""); | 30 d = _mm_add_epi8(d, a); |
31 | 31 memcpy(row, &d, 3); |
32 uint32_t packed = _mm_cvtsi128_si32(v); | 32 |
33 memcpy(p, &packed, bpp); | 33 row += 3; |
34 } | 34 rb -= 3; |
35 | 35 } |
36 template <int bpp> | 36 } |
37 static void sk_sub_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* ) { | 37 |
38 // The Sub filter predicts each pixel as the previous pixel, a. | 38 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, |
39 // There is no pixel to the left of the first pixel. It's encoded directly. | 39 const uint8_t* prev) |
40 // That works with our main loop if we just say that left pixel was zero. | 40 { |
41 __m128i a, d = _mm_setzero_si128(); | 41 // The Sub filter predicts each pixel as the previous pixel, a. |
42 | 42 // There is no pixel to the left of the first pixel. It's encoded directly. |
43 int rb = row_info->rowbytes; | 43 // That works with our main loop if we just say that left pixel was zero. |
44 while (rb > 0) { | 44 __m128i a, d = _mm_setzero_si128(); |
45 a = d; d = load<bpp>(row); | 45 |
46 d = _mm_add_epi8(d, a); | 46 int rb = row_info->rowbytes; |
47 store<bpp>(row, d); | 47 while (rb > 0) { |
48 | 48 a = d; memcpy(&d, row, 4); |
49 row += bpp; | 49 d = _mm_add_epi8(d, a); |
50 rb -= bpp; | 50 memcpy(row, &d, 4); |
51 } | 51 |
52 } | 52 row += 4; |
53 | 53 rb -= 4; |
54 template <int bpp> | 54 } |
55 void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 55 } |
56 // The Avg filter predicts each pixel as the (truncated) average of a and b. | 56 |
57 // There's no pixel to the left of the first pixel. Luckily, it's | 57 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, |
58 // predicted to be half of the pixel above it. So again, this works | 58 const uint8_t* prev) |
59 // perfectly with our loop if we make sure a starts at zero. | 59 { |
60 const __m128i zero = _mm_setzero_si128(); | 60 // The Avg filter predicts each pixel as the (truncated) average of a and b. |
61 __m128i b; | 61 // There's no pixel to the left of the first pixel. Luckily, it's |
62 __m128i a, d = zero; | 62 // predicted to be half of the pixel above it. So again, this works |
63 | 63 // perfectly with our loop if we make sure a starts at zero. |
64 int rb = row_info->rowbytes; | 64 const __m128i zero = _mm_setzero_si128(); |
65 while (rb > 0) { | 65 __m128i b; |
66 b = load<bpp>(prev); | 66 __m128i a, d = zero; |
67 a = d; d = load<bpp>(row ); | 67 |
68 | 68 int rb = row_info->rowbytes; |
69 // PNG requires a truncating average here, so sadly we can't just use _mm_avg_epu8... | 69 while (rb > 0) { |
70 __m128i avg = _mm_avg_epu8(a,b); | 70 memcpy(&b, prev, 3); |
71 // ...but we can fix it up by subtracting off 1 if it rounded up. | 71 a = d; memcpy(&d, row, 3); |
72 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e pi8(1))); | 72 |
73 | 73 // PNG requires a truncating average here, so sadly we can't just use |
74 d = _mm_add_epi8(d, avg); | 74 // _mm_avg_epu8... |
75 store<bpp>(row, d); | 75 __m128i avg = _mm_avg_epu8(a,b); |
76 | 76 // ...but we can fix it up by subtracting off 1 if it rounded up. |
77 prev += bpp; | 77 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
78 row += bpp; | 78 _mm_set1_epi8(1))); |
79 rb -= bpp; | 79 |
80 } | 80 d = _mm_add_epi8(d, avg); |
81 } | 81 memcpy(row, &d, 3); |
82 | 82 |
83 // Returns |x| for 16-bit lanes. | 83 prev += 3; |
84 static __m128i abs_i16(__m128i x) { | 84 row += 3; |
85 #if defined(__SSSE3__) | 85 rb -= 3; |
86 return _mm_abs_epi16(x); | 86 } |
87 #else | 87 } |
88 // Read this all as, return x<0 ? -x : x. | 88 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, |
89 // To negate two's complement, you flip all the bits then add 1. | 89 const uint8_t* prev) |
90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); | 90 { |
91 x = _mm_xor_si128(x, is_negative); // Flip negative lanes. | 91 // The Avg filter predicts each pixel as the (truncated) average of a and b. |
92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative lanes, else +0. | 92 // There's no pixel to the left of the first pixel. Luckily, it's |
93 return x; | 93 // predicted to be half of the pixel above it. So again, this works |
94 #endif | 94 // perfectly with our loop if we make sure a starts at zero. |
95 } | 95 const __m128i zero = _mm_setzero_si128(); |
96 | 96 __m128i b; |
97 // Bytewise c ? t : e. | 97 __m128i a, d = zero; |
98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { | 98 |
99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling. | 99 int rb = row_info->rowbytes; |
100 return _mm_blendv_epi8(e,t,c); | 100 while (rb > 0) { |
101 #else | 101 memcpy(&b, prev, 4); |
102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); | 102 a = d; memcpy(&d, row, 4); |
103 #endif | 103 |
104 } | 104 // PNG requires a truncating average here, so sadly we can't just use |
105 | 105 // _mm_avg_epu8... |
106 template <int bpp> | 106 __m128i avg = _mm_avg_epu8(a,b); |
107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev ) { | 107 // ...but we can fix it up by subtracting off 1 if it rounded up. |
108 // Paeth tries to predict pixel d using the pixel to the left of it, a, | 108 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
109 // and two pixels from the previous row, b and c: | 109 _mm_set1_epi8(1))); |
110 // prev: c b | 110 |
111 // row: a d | 111 d = _mm_add_epi8(d, avg); |
112 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c. | 112 memcpy(row, &d, 4); |
113 | 113 |
114 // The first pixel has no left context, and so uses an Up filter, p = b. | 114 prev += 4; |
115 // This works naturally with our main loop's p = a+b-c if we force a and c to zero. | 115 row += 4; |
116 // Here we zero b and d, which become c and a respectively at the start of the loop. | 116 rb -= 4; |
117 const __m128i zero = _mm_setzero_si128(); | 117 } |
118 __m128i c, b = zero, | 118 } |
119 a, d = zero; | 119 |
120 | 120 // Returns |x| for 16-bit lanes. |
121 int rb = row_info->rowbytes; | 121 static __m128i abs_i16(__m128i x) { |
122 while (rb > 0) { | 122 #if defined(__SSSE3__) |
123 // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates. | 123 return _mm_abs_epi16(x); |
124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero); | 124 #else |
125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero); | 125 // Read this all as, return x<0 ? -x : x. |
126 | 126 // To negate two's complement, you flip all the bits then add 1. |
127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) | 127 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); |
128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) | 128 // Flip negative lanes. |
129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) | 129 x = _mm_xor_si128(x, is_negative); |
130 | 130 // +1 to negative lanes, else +0. |
131 pa = abs_i16(pa); // |p-a| | 131 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); |
132 pb = abs_i16(pb); // |p-b| | 132 return x; |
133 pc = abs_i16(pc); // |p-c| | |
134 | |
135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
136 | |
137 // Paeth breaks ties favoring a over b over c. | |
138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
140 c)); | |
141 | |
142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to wrap modulo 256. | |
143 store<bpp>(row, _mm_packus_epi16(d,d)); | |
144 | |
145 prev += bpp; | |
146 row += bpp; | |
147 rb -= bpp; | |
148 } | |
149 } | |
150 | |
151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
152 sk_sub_sse2<3>(row_info, row, prev); | |
153 } | |
154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
155 sk_sub_sse2<4>(row_info, row, prev); | |
156 } | |
157 | |
158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
159 sk_avg_sse2<3>(row_info, row, prev); | |
160 } | |
161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
162 sk_avg_sse2<4>(row_info, row, prev); | |
163 } | |
164 | |
165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) { | |
166 sk_paeth_sse2<3>(row_info, row, prev); | |
167 } | |
168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre v) { | |
169 sk_paeth_sse2<4>(row_info, row, prev); | |
170 } | |
171 | |
172 #endif | 133 #endif |
134 } | |
135 | |
136 // Bytewise c ? t : e. | |
137 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { | |
138 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling. | |
msarett
2016/02/15 19:50:20
I dropped the SSE4 code for libpng patch.
mtklein
2016/02/15 20:27:33
Let's set up a bot to test this tomorrow.
It'd ac
msarett
2016/02/16 13:42:35
Dropping the #if 0 in order to actually run this c
| |
139 return _mm_blendv_epi8(e,t,c); | |
140 #else | |
141 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); | |
142 #endif | |
143 } | |
144 | |
145 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, | |
146 const uint8_t* prev) | |
147 { | |
148 // Paeth tries to predict pixel d using the pixel to the left of it, a, | |
149 // and two pixels from the previous row, b and c: | |
150 // prev: c b | |
151 // row: a d | |
152 // The Paeth function predicts d to be whichever of a, b, or c is nearest to | |
153 // p=a+b-c. The first pixel has no left context, and so uses an Up filter, | |
154 // p = b. This works naturally with our main loop's p = a+b-c if we force a | |
155 // and c to zero. Here we zero b and d, which become c and a respectively | |
156 // at the start of the loop. | |
157 const __m128i zero = _mm_setzero_si128(); | |
158 __m128i c, b = zero, | |
159 a, d = zero; | |
160 | |
161 int rb = row_info->rowbytes; | |
162 while (rb > 0) { | |
163 // It's easiest to do this math (particularly, deal with pc) with 16-bit | |
164 // intermediates. | |
165 memcpy(&b, prev, 3); | |
166 memcpy(&d, row, 3); | |
167 c = b; b = _mm_unpacklo_epi8(b, zero); | |
168 a = d; d = _mm_unpacklo_epi8(d, zero); | |
169 __m128i pa = _mm_sub_epi16(b,c), | |
170 // (p-a) == (a+b-c - a) == (b-c) | |
171 pb = _mm_sub_epi16(a,c), | |
172 // (p-b) == (a+b-c - b) == (a-c) | |
173 pc = _mm_add_epi16(pa,pb); | |
174 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) | |
175 | |
176 pa = abs_i16(pa);// |p-a| | |
177 pb = abs_i16(pb);// |p-b| | |
178 pc = abs_i16(pc);// |p-c| | |
179 | |
180 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
181 | |
182 // Paeth breaks ties favoring a over b over c. | |
183 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
184 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
185 c)); | |
186 | |
187 // Note `_epi8`: we need addition to wrap modulo 256. | |
188 d = _mm_add_epi8(d, nearest); | |
189 __m128i r = _mm_packus_epi16(d,d); | |
190 memcpy(row, &r, 3); | |
191 prev += 3; | |
192 row += 3; | |
193 rb -= 3; | |
194 } | |
195 } | |
196 | |
197 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, | |
198 const uint8_t* prev) | |
199 { | |
200 // Paeth tries to predict pixel d using the pixel to the left of it, a, | |
201 // and two pixels from the previous row, b and c: | |
202 // prev: c b | |
203 // row: a d | |
204 // The Paeth function predicts d to be whichever of a, b, or c is nearest to | |
205 // p=a+b-c. The first pixel has no left context, and so uses an Up filter, | |
206 // p = b. This works naturally with our main loop's p = a+b-c if we force a | |
207 // and c to zero. Here we zero b and d, which become c and a respectively | |
208 // at the start of the loop. | |
209 const __m128i zero = _mm_setzero_si128(); | |
210 __m128i c, b = zero, | |
211 a, d = zero; | |
212 | |
213 int rb = row_info->rowbytes; | |
214 while (rb > 0) { | |
215 // It's easiest to do this math (particularly, deal with pc) with 16-bit | |
216 // intermediates. | |
217 memcpy(&b, prev, 4); | |
218 memcpy(&d, row, 4); | |
219 c = b; b = _mm_unpacklo_epi8(b, zero); | |
220 a = d; d = _mm_unpacklo_epi8(d, zero); | |
221 __m128i pa = _mm_sub_epi16(b,c), | |
222 // (p-a) == (a+b-c - a) == (b-c) | |
223 pb = _mm_sub_epi16(a,c), | |
224 // (p-b) == (a+b-c - b) == (a-c) | |
225 pc = _mm_add_epi16(pa,pb); | |
226 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) | |
227 | |
228 pa = abs_i16(pa);// |p-a| | |
229 pb = abs_i16(pb);// |p-b| | |
230 pc = abs_i16(pc);// |p-c| | |
231 | |
232 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
233 | |
234 // Paeth breaks ties favoring a over b over c. | |
235 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
236 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
237 c)); | |
238 | |
239 // Note `_epi8`: we need addition to wrap modulo 256. | |
240 d = _mm_add_epi8(d, nearest); | |
241 __m128i r = _mm_packus_epi16(d,d); | |
242 memcpy(row, &r, 4); | |
243 prev += 4; | |
244 row += 4; | |
245 rb -= 4; | |
246 } | |
247 } | |
248 | |
249 #endif | |
OLD | NEW |