OLD | NEW |
(Empty) | |
| 1 |
| 2 /* filter_sse2_intrinsics.c - SSE2 optimized filter functions |
| 3 * |
| 4 * Copyright (c) 2016 Google, Inc. |
| 5 * Written by Mike Klein and Matt Sarett |
| 6 * Derived from arm/filter_neon_intrinsics.c, which was |
| 7 * Copyright (c) 2014,2016 Glenn Randers-Pehrson |
| 8 * |
| 9 * Last changed in libpng 1.6.22 [May 26, 2016] |
| 10 * |
| 11 * This code is released under the libpng license. |
| 12 * For conditions of distribution and use, see the disclaimer |
| 13 * and license in png.h |
| 14 */ |
| 15 |
| 16 #include "../../pngpriv.h" |
| 17 |
| 18 #ifdef PNG_READ_SUPPORTED |
| 19 |
| 20 #if PNG_INTEL_SSE_IMPLEMENTATION > 0 |
| 21 |
| 22 #include <immintrin.h> |
| 23 |
| 24 /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). |
| 25 * They're positioned like this: |
| 26 * prev: c b |
| 27 * row: a d |
| 28 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be |
| 29 * whichever of a, b, or c is closest to p=a+b-c. |
| 30 */ |
| 31 |
| 32 static __m128i load4(const void* p) { |
| 33 return _mm_cvtsi32_si128(*(const int*)p); |
| 34 } |
| 35 |
| 36 static void store4(void* p, __m128i v) { |
| 37 *(int*)p = _mm_cvtsi128_si32(v); |
| 38 } |
| 39 |
| 40 static __m128i load3(const void* p) { |
| 41 /* We'll load 2 bytes, then 1 byte, |
| 42 * then mask them together, and finally load into SSE. |
| 43 */ |
| 44 const png_uint_16* p01 = p; |
| 45 const png_byte* p2 = (const png_byte*)(p01+1); |
| 46 |
| 47 png_uint_32 v012 = (png_uint_32)(*p01) |
| 48 | (png_uint_32)(*p2) << 16; |
| 49 return load4(&v012); |
| 50 } |
| 51 |
| 52 static void store3(void* p, __m128i v) { |
| 53 /* We'll pull from SSE as a 32-bit int, then write |
| 54 * its bottom two bytes, then its third byte. |
| 55 */ |
| 56 png_uint_32 v012; |
| 57 store4(&v012, v); |
| 58 |
| 59 png_uint_16* p01 = p; |
| 60 png_byte* p2 = (png_byte*)(p01+1); |
| 61 *p01 = v012; |
| 62 *p2 = v012 >> 16; |
| 63 } |
| 64 |
| 65 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, |
| 66 png_const_bytep prev) |
| 67 { |
| 68 /* The Sub filter predicts each pixel as the previous pixel, a. |
| 69 * There is no pixel to the left of the first pixel. It's encoded directly. |
| 70 * That works with our main loop if we just say that left pixel was zero. |
| 71 */ |
| 72 png_debug(1, "in png_read_filter_row_sub3_sse2"); |
| 73 __m128i a, d = _mm_setzero_si128(); |
| 74 |
| 75 int rb = row_info->rowbytes; |
| 76 while (rb >= 4) { |
| 77 a = d; d = load4(row); |
| 78 d = _mm_add_epi8(d, a); |
| 79 store3(row, d); |
| 80 |
| 81 row += 3; |
| 82 rb -= 3; |
| 83 } |
| 84 if (rb > 0) { |
| 85 a = d; d = load3(row); |
| 86 d = _mm_add_epi8(d, a); |
| 87 store3(row, d); |
| 88 |
| 89 row += 3; |
| 90 rb -= 3; |
| 91 } |
| 92 } |
| 93 |
| 94 void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, |
| 95 png_const_bytep prev) |
| 96 { |
| 97 /* The Sub filter predicts each pixel as the previous pixel, a. |
| 98 * There is no pixel to the left of the first pixel. It's encoded directly. |
| 99 * That works with our main loop if we just say that left pixel was zero. |
| 100 */ |
| 101 png_debug(1, "in png_read_filter_row_sub4_sse2"); |
| 102 __m128i a, d = _mm_setzero_si128(); |
| 103 |
| 104 int rb = row_info->rowbytes; |
| 105 while (rb > 0) { |
| 106 a = d; d = load4(row); |
| 107 d = _mm_add_epi8(d, a); |
| 108 store4(row, d); |
| 109 |
| 110 row += 4; |
| 111 rb -= 4; |
| 112 } |
| 113 } |
| 114 |
| 115 void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, |
| 116 png_const_bytep prev) |
| 117 { |
| 118 /* The Avg filter predicts each pixel as the (truncated) average of a and b. |
| 119 * There's no pixel to the left of the first pixel. Luckily, it's |
| 120 * predicted to be half of the pixel above it. So again, this works |
| 121 * perfectly with our loop if we make sure a starts at zero. |
| 122 */ |
| 123 png_debug(1, "in png_read_filter_row_avg3_sse2"); |
| 124 const __m128i zero = _mm_setzero_si128(); |
| 125 __m128i b; |
| 126 __m128i a, d = zero; |
| 127 |
| 128 int rb = row_info->rowbytes; |
| 129 while (rb >= 4) { |
| 130 b = load4(prev); |
| 131 a = d; d = load4(row ); |
| 132 |
| 133 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ |
| 134 __m128i avg = _mm_avg_epu8(a,b); |
| 135 /* ...but we can fix it up by subtracting off 1 if it rounded up. */ |
| 136 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
| 137 _mm_set1_epi8(1))); |
| 138 d = _mm_add_epi8(d, avg); |
| 139 store3(row, d); |
| 140 |
| 141 prev += 3; |
| 142 row += 3; |
| 143 rb -= 3; |
| 144 } |
| 145 if (rb > 0) { |
| 146 b = load3(prev); |
| 147 a = d; d = load3(row ); |
| 148 |
| 149 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ |
| 150 __m128i avg = _mm_avg_epu8(a,b); |
| 151 /* ...but we can fix it up by subtracting off 1 if it rounded up. */ |
| 152 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
| 153 _mm_set1_epi8(1))); |
| 154 |
| 155 d = _mm_add_epi8(d, avg); |
| 156 store3(row, d); |
| 157 |
| 158 prev += 3; |
| 159 row += 3; |
| 160 rb -= 3; |
| 161 } |
| 162 } |
| 163 |
| 164 void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, |
| 165 png_const_bytep prev) |
| 166 { |
| 167 /* The Avg filter predicts each pixel as the (truncated) average of a and b. |
| 168 * There's no pixel to the left of the first pixel. Luckily, it's |
| 169 * predicted to be half of the pixel above it. So again, this works |
| 170 * perfectly with our loop if we make sure a starts at zero. |
| 171 */ |
| 172 png_debug(1, "in png_read_filter_row_avg4_sse2"); |
| 173 const __m128i zero = _mm_setzero_si128(); |
| 174 __m128i b; |
| 175 __m128i a, d = zero; |
| 176 |
| 177 int rb = row_info->rowbytes; |
| 178 while (rb > 0) { |
| 179 b = load4(prev); |
| 180 a = d; d = load4(row ); |
| 181 |
| 182 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ |
| 183 __m128i avg = _mm_avg_epu8(a,b); |
| 184 /* ...but we can fix it up by subtracting off 1 if it rounded up. */ |
| 185 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), |
| 186 _mm_set1_epi8(1))); |
| 187 |
| 188 d = _mm_add_epi8(d, avg); |
| 189 store4(row, d); |
| 190 |
| 191 prev += 4; |
| 192 row += 4; |
| 193 rb -= 4; |
| 194 } |
| 195 } |
| 196 |
| 197 /* Returns |x| for 16-bit lanes. */ |
| 198 static __m128i abs_i16(__m128i x) { |
| 199 #if PNG_INTEL_SSE_IMPLEMENTATION >= 2 |
| 200 return _mm_abs_epi16(x); |
| 201 #else |
| 202 /* Read this all as, return x<0 ? -x : x. |
| 203 * To negate two's complement, you flip all the bits then add 1. |
| 204 */ |
| 205 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); |
| 206 |
| 207 /* Flip negative lanes. */ |
| 208 x = _mm_xor_si128(x, is_negative); |
| 209 |
| 210 /* +1 to negative lanes, else +0. */ |
| 211 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); |
| 212 return x; |
| 213 #endif |
| 214 } |
| 215 |
| 216 /* Bytewise c ? t : e. */ |
| 217 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { |
| 218 #if PNG_INTEL_SSE_IMPLEMENTATION >= 3 |
| 219 return _mm_blendv_epi8(e,t,c); |
| 220 #else |
| 221 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); |
| 222 #endif |
| 223 } |
| 224 |
| 225 void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, |
| 226 png_const_bytep prev) |
| 227 { |
| 228 /* Paeth tries to predict pixel d using the pixel to the left of it, a, |
| 229 * and two pixels from the previous row, b and c: |
| 230 * prev: c b |
| 231 * row: a d |
| 232 * The Paeth function predicts d to be whichever of a, b, or c is nearest to |
| 233 * p=a+b-c. |
| 234 * |
| 235 * The first pixel has no left context, and so uses an Up filter, p = b. |
| 236 * This works naturally with our main loop's p = a+b-c if we force a and c |
| 237 * to zero. |
| 238 * Here we zero b and d, which become c and a respectively at the start of |
| 239 * the loop. |
| 240 */ |
| 241 png_debug(1, "in png_read_filter_row_paeth3_sse2"); |
| 242 const __m128i zero = _mm_setzero_si128(); |
| 243 __m128i c, b = zero, |
| 244 a, d = zero; |
| 245 |
| 246 int rb = row_info->rowbytes; |
| 247 while (rb >= 4) { |
| 248 /* It's easiest to do this math (particularly, deal with pc) with 16-bit |
| 249 * intermediates. |
| 250 */ |
| 251 c = b; b = _mm_unpacklo_epi8(load4(prev), zero); |
| 252 a = d; d = _mm_unpacklo_epi8(load4(row ), zero); |
| 253 |
| 254 /* (p-a) == (a+b-c - a) == (b-c) */ |
| 255 __m128i pa = _mm_sub_epi16(b,c); |
| 256 |
| 257 /* (p-b) == (a+b-c - b) == (a-c) */ |
| 258 __m128i pb = _mm_sub_epi16(a,c); |
| 259 |
| 260 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ |
| 261 __m128i pc = _mm_add_epi16(pa,pb); |
| 262 |
| 263 pa = abs_i16(pa); /* |p-a| */ |
| 264 pb = abs_i16(pb); /* |p-b| */ |
| 265 pc = abs_i16(pc); /* |p-c| */ |
| 266 |
| 267 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); |
| 268 |
| 269 /* Paeth breaks ties favoring a over b over c. */ |
| 270 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, |
| 271 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, |
| 272 c)); |
| 273 |
| 274 /* Note `_epi8`: we need addition to wrap modulo 255. */ |
| 275 d = _mm_add_epi8(d, nearest); |
| 276 store3(row, _mm_packus_epi16(d,d)); |
| 277 |
| 278 prev += 3; |
| 279 row += 3; |
| 280 rb -= 3; |
| 281 } |
| 282 if (rb > 0) { |
| 283 /* It's easiest to do this math (particularly, deal with pc) with 16-bit |
| 284 * intermediates. |
| 285 */ |
| 286 c = b; b = _mm_unpacklo_epi8(load3(prev), zero); |
| 287 a = d; d = _mm_unpacklo_epi8(load3(row ), zero); |
| 288 |
| 289 /* (p-a) == (a+b-c - a) == (b-c) */ |
| 290 __m128i pa = _mm_sub_epi16(b,c); |
| 291 |
| 292 /* (p-b) == (a+b-c - b) == (a-c) */ |
| 293 __m128i pb = _mm_sub_epi16(a,c); |
| 294 |
| 295 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ |
| 296 __m128i pc = _mm_add_epi16(pa,pb); |
| 297 |
| 298 pa = abs_i16(pa); /* |p-a| */ |
| 299 pb = abs_i16(pb); /* |p-b| */ |
| 300 pc = abs_i16(pc); /* |p-c| */ |
| 301 |
| 302 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); |
| 303 |
| 304 /* Paeth breaks ties favoring a over b over c. */ |
| 305 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, |
| 306 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, |
| 307 c)); |
| 308 |
| 309 /* Note `_epi8`: we need addition to wrap modulo 255. */ |
| 310 d = _mm_add_epi8(d, nearest); |
| 311 store3(row, _mm_packus_epi16(d,d)); |
| 312 |
| 313 prev += 3; |
| 314 row += 3; |
| 315 rb -= 3; |
| 316 } |
| 317 } |
| 318 |
| 319 void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, |
| 320 png_const_bytep prev) |
| 321 { |
| 322 /* Paeth tries to predict pixel d using the pixel to the left of it, a, |
| 323 * and two pixels from the previous row, b and c: |
| 324 * prev: c b |
| 325 * row: a d |
| 326 * The Paeth function predicts d to be whichever of a, b, or c is nearest to |
| 327 * p=a+b-c. |
| 328 * |
| 329 * The first pixel has no left context, and so uses an Up filter, p = b. |
| 330 * This works naturally with our main loop's p = a+b-c if we force a and c |
| 331 * to zero. |
| 332 * Here we zero b and d, which become c and a respectively at the start of |
| 333 * the loop. |
| 334 */ |
| 335 png_debug(1, "in png_read_filter_row_paeth4_sse2"); |
| 336 const __m128i zero = _mm_setzero_si128(); |
| 337 __m128i c, b = zero, |
| 338 a, d = zero; |
| 339 |
| 340 int rb = row_info->rowbytes; |
| 341 while (rb > 0) { |
| 342 /* It's easiest to do this math (particularly, deal with pc) with 16-bit |
| 343 * intermediates. |
| 344 */ |
| 345 c = b; b = _mm_unpacklo_epi8(load4(prev), zero); |
| 346 a = d; d = _mm_unpacklo_epi8(load4(row ), zero); |
| 347 |
| 348 /* (p-a) == (a+b-c - a) == (b-c) */ |
| 349 __m128i pa = _mm_sub_epi16(b,c); |
| 350 |
| 351 /* (p-b) == (a+b-c - b) == (a-c) */ |
| 352 __m128i pb = _mm_sub_epi16(a,c); |
| 353 |
| 354 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ |
| 355 __m128i pc = _mm_add_epi16(pa,pb); |
| 356 |
| 357 pa = abs_i16(pa); /* |p-a| */ |
| 358 pb = abs_i16(pb); /* |p-b| */ |
| 359 pc = abs_i16(pc); /* |p-c| */ |
| 360 |
| 361 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); |
| 362 |
| 363 /* Paeth breaks ties favoring a over b over c. */ |
| 364 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, |
| 365 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, |
| 366 c)); |
| 367 |
| 368 /* Note `_epi8`: we need addition to wrap modulo 255. */ |
| 369 d = _mm_add_epi8(d, nearest); |
| 370 store4(row, _mm_packus_epi16(d,d)); |
| 371 |
| 372 prev += 4; |
| 373 row += 4; |
| 374 rb -= 4; |
| 375 } |
| 376 } |
| 377 |
| 378 #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ |
| 379 #endif /* READ */ |
OLD | NEW |