| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright 2016 Google Inc. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 */ | |
| 7 | |
| 8 #include "SkPngFilters.h" | |
| 9 | |
| 10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d
). | |
| 11 // They're positioned like this: | |
| 12 // prev: c b | |
| 13 // row: a d | |
| 14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which
ever | |
| 15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) | |
| 16 | |
| 17 #if defined(__SSE2__) | |
| 18 | |
// Loads the 3 bytes at p into the low 3 bytes of the returned vector.
// Every other byte of the result is zero.  p needs no particular alignment.
static __m128i load3(const void* p) {
    // Start from zero: memcpy only fills 3 of the 4 bytes, and the remaining byte
    // would otherwise be indeterminate and flow into the vector.
    uint32_t packed = 0;
    memcpy(&packed, p, 3);
    return _mm_cvtsi32_si128((int)packed);
}
| 24 | |
// Loads the 4 bytes at p into the low 32 bits of the returned vector; the upper
// 96 bits are zero.  p needs no particular alignment.
static __m128i load4(const void* p) {
    // Copy through memcpy rather than dereferencing a cast pointer: the bytes at p
    // are not an int object and may not be int-aligned.
    int packed;
    memcpy(&packed, p, sizeof(packed));
    return _mm_cvtsi32_si128(packed);
}
| 28 | |
// Writes the low 3 bytes of v to p.  Exactly 3 bytes are written; the byte at
// p[3] is left untouched.
static void store3(void* p, __m128i v) {
    const uint32_t low_lane = _mm_cvtsi128_si32(v);
    memcpy(p, &low_lane, 3);
}
| 33 | |
// Writes the low 32 bits of v to the 4 bytes at p.  p needs no particular alignment.
static void store4(void* p, __m128i v) {
    // Copy through memcpy rather than storing through a cast pointer: the bytes at p
    // are not an int object and may not be int-aligned.
    const int packed = _mm_cvtsi128_si32(v);
    memcpy(p, &packed, sizeof(packed));
}
| 37 | |
| 38 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | |
| 39 // The Sub filter predicts each pixel as the previous pixel, a. | |
| 40 // There is no pixel to the left of the first pixel. It's encoded direc
tly. | |
| 41 // That works with our main loop if we just say that left pixel was zero
. | |
| 42 __m128i a, d = _mm_setzero_si128(); | |
| 43 | |
| 44 int rb = row_info->rowbytes; | |
| 45 while (rb > 0) { | |
| 46 a = d; d = load3(row); | |
| 47 d = _mm_add_epi8(d, a); | |
| 48 store3(row, d); | |
| 49 | |
| 50 row += 3; | |
| 51 rb -= 3; | |
| 52 } | |
| 53 } | |
| 54 | |
| 55 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | |
| 56 // The Sub filter predicts each pixel as the previous pixel, a. | |
| 57 // There is no pixel to the left of the first pixel. It's encoded direc
tly. | |
| 58 // That works with our main loop if we just say that left pixel was zero
. | |
| 59 __m128i a, d = _mm_setzero_si128(); | |
| 60 | |
| 61 int rb = row_info->rowbytes; | |
| 62 while (rb > 0) { | |
| 63 a = d; d = load4(row); | |
| 64 d = _mm_add_epi8(d, a); | |
| 65 store4(row, d); | |
| 66 | |
| 67 row += 4; | |
| 68 rb -= 4; | |
| 69 } | |
| 70 } | |
| 71 | |
| 72 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | |
| 73 // The Avg filter predicts each pixel as the (truncated) average of a an
d b. | |
| 74 // There's no pixel to the left of the first pixel. Luckily, it's | |
| 75 // predicted to be half of the pixel above it. So again, this works | |
| 76 // perfectly with our loop if we make sure a starts at zero. | |
| 77 const __m128i zero = _mm_setzero_si128(); | |
| 78 __m128i b; | |
| 79 __m128i a, d = zero; | |
| 80 | |
| 81 int rb = row_info->rowbytes; | |
| 82 while (rb > 0) { | |
| 83 b = load3(prev); | |
| 84 a = d; d = load3(row ); | |
| 85 | |
| 86 // PNG requires a truncating average here, so sadly we can't just us
e _mm_avg_epu8... | |
| 87 __m128i avg = _mm_avg_epu8(a,b); | |
| 88 // ...but we can fix it up by subtracting off 1 if it rounded up. | |
| 89 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e
pi8(1))); | |
| 90 | |
| 91 d = _mm_add_epi8(d, avg); | |
| 92 store3(row, d); | |
| 93 | |
| 94 prev += 3; | |
| 95 row += 3; | |
| 96 rb -= 3; | |
| 97 } | |
| 98 } | |
| 99 | |
| 100 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | |
| 101 // The Avg filter predicts each pixel as the (truncated) average of a an
d b. | |
| 102 // There's no pixel to the left of the first pixel. Luckily, it's | |
| 103 // predicted to be half of the pixel above it. So again, this works | |
| 104 // perfectly with our loop if we make sure a starts at zero. | |
| 105 const __m128i zero = _mm_setzero_si128(); | |
| 106 __m128i b; | |
| 107 __m128i a, d = zero; | |
| 108 | |
| 109 int rb = row_info->rowbytes; | |
| 110 while (rb > 0) { | |
| 111 b = load4(prev); | |
| 112 a = d; d = load4(row ); | |
| 113 | |
| 114 // PNG requires a truncating average here, so sadly we can't just us
e _mm_avg_epu8... | |
| 115 __m128i avg = _mm_avg_epu8(a,b); | |
| 116 // ...but we can fix it up by subtracting off 1 if it rounded up. | |
| 117 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e
pi8(1))); | |
| 118 | |
| 119 d = _mm_add_epi8(d, avg); | |
| 120 store4(row, d); | |
| 121 | |
| 122 prev += 4; | |
| 123 row += 4; | |
| 124 rb -= 4; | |
| 125 } | |
| 126 } | |
| 127 | |
// Lane-wise absolute value of eight signed 16-bit integers.
// -32768 has no positive counterpart in 16 bits and comes back unchanged (0x8000).
static __m128i abs_i16(__m128i x) {
#if defined(__SSSE3__)
    return _mm_abs_epi16(x);
#else
    // |x| == max(x, -x).  Negation wraps for -32768, so that lane stays -32768.
    const __m128i negated = _mm_sub_epi16(_mm_setzero_si128(), x);
    return _mm_max_epi16(x, negated);
#endif
}
| 141 | |
// Per-byte select: yields t where the mask c is set and e where it is clear.
// Callers in this file pass comparison results, whose bytes are all-ones or all-zeros.
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if defined(__SSE4_1__)
    return _mm_blendv_epi8(e, t, c);
#else
    // e ^ (c & (t ^ e)): where c is 1 the e bits cancel leaving t, where c is 0 e survives.
    const __m128i differing = _mm_xor_si128(t, e);
    return _mm_xor_si128(e, _mm_and_si128(c, differing));
#endif
}
| 150 | |
| 151 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre
v) { | |
| 152 // Paeth tries to predict pixel d using the pixel to the left of it, a, | |
| 153 // and two pixels from the previous row, b and c: | |
| 154 // prev: c b | |
| 155 // row: a d | |
| 156 // The Paeth function predicts d to be whichever of a, b, or c is neares
t to p=a+b-c. | |
| 157 | |
| 158 // The first pixel has no left context, and so uses an Up filter, p = b. | |
| 159 // This works naturally with our main loop's p = a+b-c if we force a and
c to zero. | |
| 160 // Here we zero b and d, which become c and a respectively at the start
of the loop. | |
| 161 const __m128i zero = _mm_setzero_si128(); | |
| 162 __m128i c, b = zero, | |
| 163 a, d = zero; | |
| 164 | |
| 165 int rb = row_info->rowbytes; | |
| 166 while (rb > 0) { | |
| 167 // It's easiest to do this math (particularly, deal with pc) with 16
-bit intermediates. | |
| 168 c = b; b = _mm_unpacklo_epi8(load3(prev), zero); | |
| 169 a = d; d = _mm_unpacklo_epi8(load3(row ), zero); | |
| 170 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) | |
| 171 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) | |
| 172 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c
-c) == (b-c)+(a-c) | |
| 173 | |
| 174 pa = abs_i16(pa); // |p-a| | |
| 175 pb = abs_i16(pb); // |p-b| | |
| 176 pc = abs_i16(pc); // |p-c| | |
| 177 | |
| 178 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
| 179 | |
| 180 // Paeth breaks ties favoring a over b over c. | |
| 181 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
| 182 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
| 183 c)); | |
| 184 | |
| 185 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to
wrap modulo 255. | |
| 186 store3(row, _mm_packus_epi16(d,d)); | |
| 187 | |
| 188 prev += 3; | |
| 189 row += 3; | |
| 190 rb -= 3; | |
| 191 } | |
| 192 } | |
| 193 | |
| 194 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre
v) { | |
| 195 // Paeth tries to predict pixel d using the pixel to the left of it, a, | |
| 196 // and two pixels from the previous row, b and c: | |
| 197 // prev: c b | |
| 198 // row: a d | |
| 199 // The Paeth function predicts d to be whichever of a, b, or c is neares
t to p=a+b-c. | |
| 200 | |
| 201 // The first pixel has no left context, and so uses an Up filter, p = b. | |
| 202 // This works naturally with our main loop's p = a+b-c if we force a and
c to zero. | |
| 203 // Here we zero b and d, which become c and a respectively at the start
of the loop. | |
| 204 const __m128i zero = _mm_setzero_si128(); | |
| 205 __m128i c, b = zero, | |
| 206 a, d = zero; | |
| 207 | |
| 208 int rb = row_info->rowbytes; | |
| 209 while (rb > 0) { | |
| 210 // It's easiest to do this math (particularly, deal with pc) with 16
-bit intermediates. | |
| 211 c = b; b = _mm_unpacklo_epi8(load4(prev), zero); | |
| 212 a = d; d = _mm_unpacklo_epi8(load4(row ), zero); | |
| 213 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) | |
| 214 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) | |
| 215 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c
-c) == (b-c)+(a-c) | |
| 216 | |
| 217 pa = abs_i16(pa); // |p-a| | |
| 218 pb = abs_i16(pb); // |p-b| | |
| 219 pc = abs_i16(pc); // |p-c| | |
| 220 | |
| 221 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | |
| 222 | |
| 223 // Paeth breaks ties favoring a over b over c. | |
| 224 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | |
| 225 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | |
| 226 c)); | |
| 227 | |
| 228 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to
wrap modulo 255. | |
| 229 store4(row, _mm_packus_epi16(d,d)); | |
| 230 | |
| 231 prev += 4; | |
| 232 row += 4; | |
| 233 rb -= 4; | |
| 234 } | |
| 235 } | |
| 236 | |
| 237 #endif | |
| OLD | NEW |