Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkPngFilters.h" | 8 #include "SkPngFilters.h" |
| 9 | 9 |
| 10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d). | 10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d). |
| 11 // They're positioned like this: | 11 // They're positioned like this: |
| 12 // prev: c b | 12 // prev: c b |
| 13 // row: a d | 13 // row: a d |
| 14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which ever | 14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which ever |
| 15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) | 15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) |
| 16 | 16 |
| 17 #if defined(__SSE2__) | 17 #if defined(__SSE2__) |
| 18 | 18 |
| 19 template <int bpp> | 19 static __m128i load3(const void* p) { |
| 20 static __m128i load(const void* p) { | 20 uint32_t packed; |
| 21 static_assert(bpp <= 4, ""); | 21 memcpy(&packed, p, 3); |
| 22 | 22 return _mm_cvtsi32_si128(packed); |
| 23 uint32_t packed; | 23 } |
| 24 memcpy(&packed, p, bpp); | 24 |
| 25 return _mm_cvtsi32_si128(packed); | 25 static __m128i load4(const void* p) { |
| 26 } | 26 uint32_t packed; |
| 27 | 27 memcpy(&packed, p, 4); |
| 28 template <int bpp> | 28 return _mm_cvtsi32_si128(packed); |
|
mtklein
2016/02/16 14:11:57
Now that we've split these apart, we might conside
| |
| 29 static void store(void* p, __m128i v) { | 29 } |
| 30 static_assert(bpp <= 4, ""); | 30 |
| 31 | 31 static void store3(void* p, __m128i v) { |
| 32 uint32_t packed = _mm_cvtsi128_si32(v); | 32 uint32_t packed = _mm_cvtsi128_si32(v); |
| 33 memcpy(p, &packed, bpp); | 33 memcpy(p, &packed, 3); |
| 34 } | 34 } |
| 35 | 35 |
| 36 template <int bpp> | 36 static void store4(void* p, __m128i v) { |
| 37 static void sk_sub_sse2(png_row_infop row_info, uint8_t* row, const uint8_t*) { | 37 uint32_t packed = _mm_cvtsi128_si32(v); |
| 38 // The Sub filter predicts each pixel as the previous pixel, a. | 38 memcpy(p, &packed, 4); |
| 39 // There is no pixel to the left of the first pixel. It's encoded directly. | 39 } |
| 40 // That works with our main loop if we just say that left pixel was zero. | 40 |
| 41 __m128i a, d = _mm_setzero_si128(); | 41 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, |
|
mtklein
2016/02/16 14:11:57
Why do these guys go to two lines? Wouldn't it on
msarett
2016/02/16 14:48:59
Not sure. I copied the style of the signatures fr
| |
| 42 | 42 const uint8_t* prev) |
| 43 int rb = row_info->rowbytes; | 43 { |
| 44 while (rb > 0) { | 44 // The Sub filter predicts each pixel as the previous pixel, a. |
| 45 a = d; d = load<bpp>(row); | 45 // There is no pixel to the left of the first pixel. It's encoded directly. |
| 46 d = _mm_add_epi8(d, a); | 46 // That works with our main loop if we just say that left pixel was zero. |
| 47 store<bpp>(row, d); | 47 __m128i a, d = _mm_setzero_si128(); |
| 48 | 48 |
| 49 row += bpp; | 49 int rb = row_info->rowbytes; |
| 50 rb -= bpp; | 50 while (rb > 0) { |
| 51 } | 51 a = d; d = load3(row); |
| 52 } | 52 d = _mm_add_epi8(d, a); |
| 53 | 53 store3(row, d); |
| 54 template <int bpp> | 54 |
| 55 void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 55 row += 3; |
| 56 // The Avg filter predicts each pixel as the (truncated) average of a and b. | 56 rb -= 3; |
| 57 // There's no pixel to the left of the first pixel. Luckily, it's | 57 } |
| 58 // predicted to be half of the pixel above it. So again, this works | 58 } |
| 59 // perfectly with our loop if we make sure a starts at zero. | 59 |
| 60 const __m128i zero = _mm_setzero_si128(); | 60 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, |
| 61 __m128i b; | 61 const uint8_t* prev) |
| 62 __m128i a, d = zero; | 62 { |
| 63 | 63 // The Sub filter predicts each pixel as the previous pixel, a. |
| 64 int rb = row_info->rowbytes; | 64 // There is no pixel to the left of the first pixel. It's encoded directly. |
| 65 while (rb > 0) { | 65 // That works with our main loop if we just say that left pixel was zero. |
| 66 b = load<bpp>(prev); | 66 __m128i a, d = _mm_setzero_si128(); |
| 67 a = d; d = load<bpp>(row ); | 67 |
| 68 | 68 int rb = row_info->rowbytes; |
| 69 // PNG requires a truncating average here, so sadly we can't just use _mm_avg_epu8... | 69 while (rb > 0) { |
| 70 __m128i avg = _mm_avg_epu8(a,b); | 70 a = d; d = load4(row); |
| 71 // ...but we can fix it up by subtracting off 1 if it rounded up. | 71 d = _mm_add_epi8(d, a); |
| 72 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1))); | 72 store4(row, d); |
| 73 | 73 |
| 74 d = _mm_add_epi8(d, avg); | 74 row += 4; |
| 75 store<bpp>(row, d); | 75 rb -= 4; |
| 76 | 76 } |
| 77 prev += bpp; | 77 } |
| 78 row += bpp; | 78 |
| 79 rb -= bpp; | 79 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, |
| 80 } | 80 const uint8_t* prev) |
| 81 } | 81 { |
| 82 | 82 // The Avg filter predicts each pixel as the (truncated) average of a and b. |
| 83 // Returns |x| for 16-bit lanes. | 83 // There's no pixel to the left of the first pixel. Luckily, it's |
| 84 static __m128i abs_i16(__m128i x) { | 84 // predicted to be half of the pixel above it. So again, this works |
| 85 #if defined(__SSSE3__) | 85 // perfectly with our loop if we make sure a starts at zero. |
| 86 return _mm_abs_epi16(x); | 86 const __m128i zero = _mm_setzero_si128(); |
| 87 #else | 87 __m128i b; |
|
mtklein
2016/02/16 14:11:57
We might want to scoot b over a few columns right
msarett
2016/02/16 14:48:59
Done.
| |
| 88 // Read this all as, return x<0 ? -x : x. | 88 __m128i a, d = zero; |
| 89 // To negate two's complement, you flip all the bits then add 1. | 89 |
| 90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); | 90 int rb = row_info->rowbytes; |
| 91 x = _mm_xor_si128(x, is_negative); // Flip negative lanes. | 91 while (rb > 0) { |
| 92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative lanes, else +0. | 92 b = load3(prev); |
| 93 return x; | 93 a = d; d = load3(row); |
| 94 #endif | 94 |
| 95 } | 95 // PNG requires a truncating average here, so sadly we can't just use |
| 96 | 96 // _mm_avg_epu8... |
| 97 // Bytewise c ? t : e. | 97 __m128i avg = _mm_avg_epu8(a,b); |
| 98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { | 98 // ...but we can fix it up by subtracting off 1 if it rounded up. |
| 99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling. | 99 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
| 100 return _mm_blendv_epi8(e,t,c); | 100 _mm_set1_epi8(1))); |
| 101 #else | 101 |
| 102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); | 102 d = _mm_add_epi8(d, avg); |
| 103 #endif | 103 store3(row, d); |
| 104 } | 104 |
| 105 | 105 prev += 3; |
| 106 template <int bpp> | 106 row += 3; |
| 107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 107 rb -= 3; |
| 108 // Paeth tries to predict pixel d using the pixel to the left of it, a, | 108 } |
| 109 // and two pixels from the previous row, b and c: | 109 } |
| 110 // prev: c b | 110 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, |
| 111 // row: a d | 111 const uint8_t* prev) |
| 112 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c. | 112 { |
| 113 | 113 // The Avg filter predicts each pixel as the (truncated) average of a and b. |
| 114 // The first pixel has no left context, and so uses an Up filter, p = b. | 114 // There's no pixel to the left of the first pixel. Luckily, it's |
| 115 // This works naturally with our main loop's p = a+b-c if we force a and c to zero. | 115 // predicted to be half of the pixel above it. So again, this works |
| 116 // Here we zero b and d, which become c and a respectively at the start of the loop. | 116 // perfectly with our loop if we make sure a starts at zero. |
| 117 const __m128i zero = _mm_setzero_si128(); | 117 const __m128i zero = _mm_setzero_si128(); |
| 118 __m128i c, b = zero, | 118 __m128i b; |
| 119 a, d = zero; | 119 __m128i a, d = zero; |
| 120 | 120 |
| 121 int rb = row_info->rowbytes; | 121 int rb = row_info->rowbytes; |
| 122 while (rb > 0) { | 122 while (rb > 0) { |
| 123 // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates. | 123 b = load4(prev); |
| 124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero); | 124 a = d; d = load4(row); |
| 125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero); | 125 |
| 126 | 126 // PNG requires a truncating average here, so sadly we can't just use |
| 127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) | 127 // _mm_avg_epu8... |
| 128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) | 128 __m128i avg = _mm_avg_epu8(a,b); |
| 129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) | 129 // ...but we can fix it up by subtracting off 1 if it rounded up. |
| 130 | 130 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
| 131 pa = abs_i16(pa); // |p-a| | 131 _mm_set1_epi8(1))); |
| 132 pb = abs_i16(pb); // |p-b| | 132 |
| 133 pc = abs_i16(pc); // |p-c| | 133 d = _mm_add_epi8(d, avg); |
| 134 | 134 store4(row, d); |
| 135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | 135 |
| 136 | 136 prev += 4; |
| 137 // Paeth breaks ties favoring a over b over c. | 137 row += 4; |
| 138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | 138 rb -= 4; |
| 139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | 139 } |
| 140 c)); | 140 } |
| 141 | 141 |
| 142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to wrap modulo 255. | 142 // Returns |x| for 16-bit lanes. |
| 143 store<bpp>(row, _mm_packus_epi16(d,d)); | 143 static __m128i abs_i16(__m128i x) { |
| 144 | 144 #if defined(__SSSE3__) |
| 145 prev += bpp; | 145 return _mm_abs_epi16(x); |
| 146 row += bpp; | 146 #else |
| 147 rb -= bpp; | 147 // Read this all as, return x<0 ? -x : x. |
| 148 } | 148 // To negate two's complement, you flip all the bits then add 1. |
| 149 } | 149 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); |
| 150 | 150 // Flip negative lanes. |
| 151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 151 x = _mm_xor_si128(x, is_negative); |
| 152 sk_sub_sse2<3>(row_info, row, prev); | 152 // +1 to negative lanes, else +0. |
| 153 } | 153 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); |
| 154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 154 return x; |
| 155 sk_sub_sse2<4>(row_info, row, prev); | |
| 156 } | |
| 157 | |
| 158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
| 159 sk_avg_sse2<3>(row_info, row, prev); | |
| 160 } | |
| 161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
| 162 sk_avg_sse2<4>(row_info, row, prev); | |
| 163 } | |
| 164 | |
| 165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
| 166 sk_paeth_sse2<3>(row_info, row, prev); | |
| 167 } | |
| 168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | |
| 169 sk_paeth_sse2<4>(row_info, row, prev); | |
| 170 } | |
| 171 | |
| 172 #endif | 155 #endif |
| 156 } | |
| 157 | |
| 158 // Bytewise c ? t : e. | |
| 159 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { | |
| 160 #if defined(__SSE4_1__) | |
| 161 return _mm_blendv_epi8(e,t,c); | |
| 162 #else | |
| 163 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); | |
| 164 #endif | |
| 165 } | |
| 166 | |
| 167 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, | |
| 168 const uint8_t* prev) | |
| 169 { | |
| 170 // Paeth tries to predict pixel d using the pixel to the left of it, a, | |
| 171 // and two pixels from the previous row, b and c: | |
| 172 // prev: c b | |
| 173 // row: a d | |
| 174 // The Paeth function predicts d to be whichever of a, b, or c is nearest to | |
| 175 // p=a+b-c. The first pixel has no left context, and so uses an Up filter, | |
| 176 // p = b. This works naturally with our main loop's p = a+b-c if we force a | |
| 177 // and c to zero. Here we zero b and d, which become c and a respectively | |
| 178 // at the start of the loop. | |
| 179 const __m128i zero = _mm_setzero_si128(); | |
| 180 __m128i c, b = zero, | |
| 181 a, d = zero; | |
| 182 | |
| 183 int rb = row_info->rowbytes; | |
| 184 while (rb > 0) { | |
| 185 // It's easiest to do this math (particularly, deal with pc) with 16-bit | |
| 186 // intermediates. | |
| 187 b = load3(prev); | |
| 188 d = load3(row); | |
|
mtklein
2016/02/16 14:11:57
I think this breaks things by loading new values f
msarett
2016/02/16 14:48:59
Done.
| |
| 189 c = b; b = _mm_unpacklo_epi8(b, zero); | |
| 190 a = d; d = _mm_unpacklo_epi8(d, zero); | |
| 191 __m128i pa = _mm_sub_epi16(b,c), | |
| 192 // (p-a) == (a+b-c - a) == (b-c) | |
|
mtklein
2016/02/16 14:11:57
Moving these comments around and changing the alig
msarett
2016/02/16 14:48:59
Done.
| |
| 193 pb = _mm_sub_epi16(a,c), | |
| 194 // (p-b) == (a+b-c - b) == (a-c) | |
| 195 pc = _mm_add_epi16(pa,pb); | |
| 196 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) | |
| 197 | |
| 198 pa = abs_i16(pa);// |p-a| | |
| 199 pb = abs_i16(pb);// |p-b| | |
| 200 pc = abs_i16(pc);// |p-c| | |
| 201 | |
| 202 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
| 203 | |
| 204 // Paeth breaks ties favoring a over b over c. | |
| 205 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
| 206 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
|
mtklein
2016/02/16 14:11:57
I think this hurts readability to indent like this
msarett
2016/02/16 14:48:58
Done.
| |
| 207 c)); | |
| 208 | |
| 209 // Note `_epi8`: we need addition to wrap modulo 255. | |
| 210 d = _mm_add_epi8(d, nearest); | |
| 211 store3(row, _mm_packus_epi16(d,d)); | |
| 212 prev += 3; | |
| 213 row += 3; | |
| 214 rb -= 3; | |
| 215 } | |
| 216 } | |
| 217 | |
| 218 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, | |
| 219 const uint8_t* prev) | |
| 220 { | |
| 221 // Paeth tries to predict pixel d using the pixel to the left of it, a, | |
| 222 // and two pixels from the previous row, b and c: | |
| 223 // prev: c b | |
| 224 // row: a d | |
| 225 // The Paeth function predicts d to be whichever of a, b, or c is nearest to | |
| 226 // p=a+b-c. The first pixel has no left context, and so uses an Up filter, | |
| 227 // p = b. This works naturally with our main loop's p = a+b-c if we force a | |
| 228 // and c to zero. Here we zero b and d, which become c and a respectively | |
| 229 // at the start of the loop. | |
| 230 const __m128i zero = _mm_setzero_si128(); | |
| 231 __m128i c, b = zero, | |
| 232 a, d = zero; | |
| 233 | |
| 234 int rb = row_info->rowbytes; | |
| 235 while (rb > 0) { | |
| 236 // It's easiest to do this math (particularly, deal with pc) with 16-bit | |
| 237 // intermediates. | |
| 238 b = load4(prev); | |
| 239 d = load4(row); | |
| 240 c = b; b = _mm_unpacklo_epi8(b, zero); | |
| 241 a = d; d = _mm_unpacklo_epi8(d, zero); | |
| 242 __m128i pa = _mm_sub_epi16(b,c), | |
| 243 // (p-a) == (a+b-c - a) == (b-c) | |
| 244 pb = _mm_sub_epi16(a,c), | |
| 245 // (p-b) == (a+b-c - b) == (a-c) | |
| 246 pc = _mm_add_epi16(pa,pb); | |
| 247 // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) | |
| 248 | |
| 249 pa = abs_i16(pa);// |p-a| | |
| 250 pb = abs_i16(pb);// |p-b| | |
| 251 pc = abs_i16(pc);// |p-c| | |
| 252 | |
| 253 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
| 254 | |
| 255 // Paeth breaks ties favoring a over b over c. | |
| 256 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
| 257 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
| 258 c)); | |
| 259 | |
| 260 // Note `_epi8`: we need addition to wrap modulo 255. | |
| 261 d = _mm_add_epi8(d, nearest); | |
| 262 store4(row, _mm_packus_epi16(d,d)); | |
| 263 prev += 4; | |
| 264 row += 4; | |
| 265 rb -= 4; | |
| 266 } | |
| 267 } | |
| 268 | |
| 269 #endif | |
| OLD | NEW |