OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkPngFilters.h" | 8 #include "SkPngFilters.h" |
9 | 9 |
10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d
). | 10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d
). |
11 // They're positioned like this: | 11 // They're positioned like this: |
12 // prev: c b | 12 // prev: c b |
13 // row: a d | 13 // row: a d |
14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which
ever | 14 // The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be which
ever |
15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) | 15 // of a, b, or c is closest to p=a+b-c. (Up also exists, predicting d=b.) |
16 | 16 |
17 #if defined(__SSE2__) | 17 #if defined(__SSE2__) |
18 | 18 |
19 template <int bpp> | 19 static __m128i load3(const void* p) { |
20 static __m128i load(const void* p) { | |
21 static_assert(bpp <= 4, ""); | |
22 | |
23 uint32_t packed; | 20 uint32_t packed; |
24 memcpy(&packed, p, bpp); | 21 memcpy(&packed, p, 3); |
25 return _mm_cvtsi32_si128(packed); | 22 return _mm_cvtsi32_si128(packed); |
26 } | 23 } |
27 | 24 |
28 template <int bpp> | 25 static __m128i load4(const void* p) { |
29 static void store(void* p, __m128i v) { | 26 return _mm_cvtsi32_si128(*(const int*)p); |
30 static_assert(bpp <= 4, ""); | |
31 | |
32 uint32_t packed = _mm_cvtsi128_si32(v); | |
33 memcpy(p, &packed, bpp); | |
34 } | 27 } |
35 | 28 |
36 template <int bpp> | 29 static void store3(void* p, __m128i v) { |
37 static void sk_sub_sse2(png_row_infop row_info, uint8_t* row, const uint8_t*
) { | 30 uint32_t packed = _mm_cvtsi128_si32(v); |
| 31 memcpy(p, &packed, 3); |
| 32 } |
| 33 |
| 34 static void store4(void* p, __m128i v) { |
| 35 *(int*)p = _mm_cvtsi128_si32(v); |
| 36 } |
| 37 |
| 38 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ |
38 // The Sub filter predicts each pixel as the previous pixel, a. | 39 // The Sub filter predicts each pixel as the previous pixel, a. |
39 // There is no pixel to the left of the first pixel. It's encoded direc
tly. | 40 // There is no pixel to the left of the first pixel. It's encoded direc
tly. |
40 // That works with our main loop if we just say that left pixel was zero
. | 41 // That works with our main loop if we just say that left pixel was zero
. |
| 42 __m128i a, d = _mm_setzero_si128(); |
| 43 |
| 44 int rb = row_info->rowbytes; |
| 45 while (rb > 0) { |
| 46 a = d; d = load3(row); |
| 47 d = _mm_add_epi8(d, a); |
| 48 store3(row, d); |
| 49 |
| 50 row += 3; |
| 51 rb -= 3; |
| 52 } |
| 53 } |
| 54 |
| 55 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ |
| 56 // The Sub filter predicts each pixel as the previous pixel, a. |
| 57 // There is no pixel to the left of the first pixel. It's encoded direc
tly. |
| 58 // That works with our main loop if we just say that left pixel was zero
. |
41 __m128i a, d = _mm_setzero_si128(); | 59 __m128i a, d = _mm_setzero_si128(); |
42 | 60 |
43 int rb = row_info->rowbytes; | 61 int rb = row_info->rowbytes; |
44 while (rb > 0) { | 62 while (rb > 0) { |
45 a = d; d = load<bpp>(row); | 63 a = d; d = load4(row); |
46 d = _mm_add_epi8(d, a); | 64 d = _mm_add_epi8(d, a); |
47 store<bpp>(row, d); | 65 store4(row, d); |
48 | 66 |
49 row += bpp; | 67 row += 4; |
50 rb -= bpp; | 68 rb -= 4; |
51 } | 69 } |
52 } | 70 } |
53 | 71 |
54 template <int bpp> | 72 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ |
55 void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | |
56 // The Avg filter predicts each pixel as the (truncated) average of a an
d b. | 73 // The Avg filter predicts each pixel as the (truncated) average of a an
d b. |
57 // There's no pixel to the left of the first pixel. Luckily, it's | 74 // There's no pixel to the left of the first pixel. Luckily, it's |
58 // predicted to be half of the pixel above it. So again, this works | 75 // predicted to be half of the pixel above it. So again, this works |
| 76 // perfectly with our loop if we make sure a starts at zero. |
| 77 const __m128i zero = _mm_setzero_si128(); |
| 78 __m128i b; |
| 79 __m128i a, d = zero; |
| 80 |
| 81 int rb = row_info->rowbytes; |
| 82 while (rb > 0) { |
| 83 b = load3(prev); |
| 84 a = d; d = load3(row ); |
| 85 |
| 86 // PNG requires a truncating average here, so sadly we can't just us
e _mm_avg_epu8... |
| 87 __m128i avg = _mm_avg_epu8(a,b); |
| 88 // ...but we can fix it up by subtracting off 1 if it rounded up. |
| 89 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e
pi8(1))); |
| 90 |
| 91 d = _mm_add_epi8(d, avg); |
| 92 store3(row, d); |
| 93 |
| 94 prev += 3; |
| 95 row += 3; |
| 96 rb -= 3; |
| 97 } |
| 98 } |
| 99 |
| 100 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ |
| 101 // The Avg filter predicts each pixel as the (truncated) average of a an
d b. |
| 102 // There's no pixel to the left of the first pixel. Luckily, it's |
| 103 // predicted to be half of the pixel above it. So again, this works |
59 // perfectly with our loop if we make sure a starts at zero. | 104 // perfectly with our loop if we make sure a starts at zero. |
60 const __m128i zero = _mm_setzero_si128(); | 105 const __m128i zero = _mm_setzero_si128(); |
61 __m128i b; | 106 __m128i b; |
62 __m128i a, d = zero; | 107 __m128i a, d = zero; |
63 | 108 |
64 int rb = row_info->rowbytes; | 109 int rb = row_info->rowbytes; |
65 while (rb > 0) { | 110 while (rb > 0) { |
66 b = load<bpp>(prev); | 111 b = load4(prev); |
67 a = d; d = load<bpp>(row ); | 112 a = d; d = load4(row ); |
68 | 113 |
69 // PNG requires a truncating average here, so sadly we can't just us
e _mm_avg_epu8... | 114 // PNG requires a truncating average here, so sadly we can't just us
e _mm_avg_epu8... |
70 __m128i avg = _mm_avg_epu8(a,b); | 115 __m128i avg = _mm_avg_epu8(a,b); |
71 // ...but we can fix it up by subtracting off 1 if it rounded up. | 116 // ...but we can fix it up by subtracting off 1 if it rounded up. |
72 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e
pi8(1))); | 117 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_e
pi8(1))); |
73 | 118 |
74 d = _mm_add_epi8(d, avg); | 119 d = _mm_add_epi8(d, avg); |
75 store<bpp>(row, d); | 120 store4(row, d); |
76 | 121 |
77 prev += bpp; | 122 prev += 4; |
78 row += bpp; | 123 row += 4; |
79 rb -= bpp; | 124 rb -= 4; |
80 } | 125 } |
81 } | 126 } |
82 | 127 |
83 // Returns |x| for 16-bit lanes. | 128 // Returns |x| for 16-bit lanes. |
84 static __m128i abs_i16(__m128i x) { | 129 static __m128i abs_i16(__m128i x) { |
85 #if defined(__SSSE3__) | 130 #if defined(__SSSE3__) |
86 return _mm_abs_epi16(x); | 131 return _mm_abs_epi16(x); |
87 #else | 132 #else |
88 // Read this all as, return x<0 ? -x : x. | 133 // Read this all as, return x<0 ? -x : x. |
89 // To negate two's complement, you flip all the bits then add 1. | 134 // To negate two's complement, you flip all the bits then add 1. |
90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); | 135 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); |
91 x = _mm_xor_si128(x, is_negative); // Flip negative
lanes. | 136 x = _mm_xor_si128(x, is_negative); // Flip negative
lanes. |
92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negativ
e lanes, else +0. | 137 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative
lanes, else +0. |
93 return x; | 138 return x; |
94 #endif | 139 #endif |
95 } | 140 } |
96 | 141 |
97 // Bytewise c ? t : e. | 142 // Bytewise c ? t : e. |
98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { | 143 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { |
99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before
enabling. | 144 #if defined(__SSE4_1__) |
100 return _mm_blendv_epi8(e,t,c); | 145 return _mm_blendv_epi8(e,t,c); |
101 #else | 146 #else |
102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); | 147 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); |
103 #endif | 148 #endif |
104 } | 149 } |
105 | 150 |
106 template <int bpp> | 151 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre
v) { |
107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev
) { | |
108 // Paeth tries to predict pixel d using the pixel to the left of it, a, | 152 // Paeth tries to predict pixel d using the pixel to the left of it, a, |
109 // and two pixels from the previous row, b and c: | 153 // and two pixels from the previous row, b and c: |
110 // prev: c b | 154 // prev: c b |
111 // row: a d | 155 // row: a d |
112 // The Paeth function predicts d to be whichever of a, b, or c is neares
t to p=a+b-c. | 156 // The Paeth function predicts d to be whichever of a, b, or c is neares
t to p=a+b-c. |
113 | 157 |
114 // The first pixel has no left context, and so uses an Up filter, p = b. | 158 // The first pixel has no left context, and so uses an Up filter, p = b. |
115 // This works naturally with our main loop's p = a+b-c if we force a and
c to zero. | 159 // This works naturally with our main loop's p = a+b-c if we force a and
c to zero. |
116 // Here we zero b and d, which become c and a respectively at the start
of the loop. | 160 // Here we zero b and d, which become c and a respectively at the start
of the loop. |
117 const __m128i zero = _mm_setzero_si128(); | 161 const __m128i zero = _mm_setzero_si128(); |
118 __m128i c, b = zero, | 162 __m128i c, b = zero, |
119 a, d = zero; | 163 a, d = zero; |
120 | 164 |
121 int rb = row_info->rowbytes; | 165 int rb = row_info->rowbytes; |
122 while (rb > 0) { | 166 while (rb > 0) { |
123 // It's easiest to do this math (particularly, deal with pc) with 16
-bit intermediates. | 167 // It's easiest to do this math (particularly, deal with pc) with 16
-bit intermediates. |
124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero); | 168 c = b; b = _mm_unpacklo_epi8(load3(prev), zero); |
125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero); | 169 a = d; d = _mm_unpacklo_epi8(load3(row ), zero); |
126 | |
127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) | 170 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) |
128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) | 171 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) |
129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c
-c) == (b-c)+(a-c) | 172 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c
-c) == (b-c)+(a-c) |
130 | 173 |
131 pa = abs_i16(pa); // |p-a| | 174 pa = abs_i16(pa); // |p-a| |
132 pb = abs_i16(pb); // |p-b| | 175 pb = abs_i16(pb); // |p-b| |
133 pc = abs_i16(pc); // |p-c| | 176 pc = abs_i16(pc); // |p-c| |
134 | 177 |
135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); | 178 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); |
136 | 179 |
137 // Paeth breaks ties favoring a over b over c. | 180 // Paeth breaks ties favoring a over b over c. |
138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, | 181 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, |
139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, | 182 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, |
140 c)); | 183 c)); |
141 | 184 |
142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to
wrap modulo 256. | 185 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to
wrap modulo 256. |
143 store<bpp>(row, _mm_packus_epi16(d,d)); | 186 store3(row, _mm_packus_epi16(d,d)); |
144 | 187 |
145 prev += bpp; | 188 prev += 3; |
146 row += bpp; | 189 row += 3; |
147 rb -= bpp; | 190 rb -= 3; |
148 } | 191 } |
149 } | 192 } |
150 | 193 |
151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | 194 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre
v) { |
152 sk_sub_sse2<3>(row_info, row, prev); | 195 // Paeth tries to predict pixel d using the pixel to the left of it, a, |
153 } | 196 // and two pixels from the previous row, b and c: |
154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | 197 // prev: c b |
155 sk_sub_sse2<4>(row_info, row, prev); | 198 // row: a d |
156 } | 199 // The Paeth function predicts d to be whichever of a, b, or c is neares
t to p=a+b-c. |
157 | 200 |
158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | 201 // The first pixel has no left context, and so uses an Up filter, p = b. |
159 sk_avg_sse2<3>(row_info, row, prev); | 202 // This works naturally with our main loop's p = a+b-c if we force a and
c to zero. |
160 } | 203 // Here we zero b and d, which become c and a respectively at the start
of the loop. |
161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev)
{ | 204 const __m128i zero = _mm_setzero_si128(); |
162 sk_avg_sse2<4>(row_info, row, prev); | 205 __m128i c, b = zero, |
163 } | 206 a, d = zero; |
164 | 207 |
165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre
v) { | 208 int rb = row_info->rowbytes; |
166 sk_paeth_sse2<3>(row_info, row, prev); | 209 while (rb > 0) { |
167 } | 210 // It's easiest to do this math (particularly, deal with pc) with 16
-bit intermediates. |
168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* pre
v) { | 211 c = b; b = _mm_unpacklo_epi8(load4(prev), zero); |
169 sk_paeth_sse2<4>(row_info, row, prev); | 212 a = d; d = _mm_unpacklo_epi8(load4(row ), zero); |
| 213 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) |
| 214 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) |
| 215 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c
-c) == (b-c)+(a-c) |
| 216 |
| 217 pa = abs_i16(pa); // |p-a| |
| 218 pb = abs_i16(pb); // |p-b| |
| 219 pc = abs_i16(pc); // |p-c| |
| 220 |
| 221 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); |
| 222 |
| 223 // Paeth breaks ties favoring a over b over c. |
| 224 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, |
| 225 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, |
| 226 c)); |
| 227 |
| 228 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to
wrap modulo 256. |
| 229 store4(row, _mm_packus_epi16(d,d)); |
| 230 |
| 231 prev += 4; |
| 232 row += 4; |
| 233 rb -= 4; |
| 234 } |
170 } | 235 } |
171 | 236 |
172 #endif | 237 #endif |
OLD | NEW |