OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkPngFilters.h" | 8 #include "SkPngFilters.h" |
9 | 9 |
10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d). | 10 // Functions in this file look at most 3 pixels (a,b,c) to predict the fourth (d). |
(...skipping 62 matching lines...)
73 | 73 |
74 d = _mm_add_epi8(d, avg); | 74 d = _mm_add_epi8(d, avg); |
75 store<bpp>(row, d); | 75 store<bpp>(row, d); |
76 | 76 |
77 prev += bpp; | 77 prev += bpp; |
78 row += bpp; | 78 row += bpp; |
79 rb -= bpp; | 79 rb -= bpp; |
80 } | 80 } |
81 } | 81 } |
82 | 82 |
83 // Returns bytewise |x-y|. | 83 // Returns |x| for 16-bit lanes. |
84 static __m128i absdiff_u8(__m128i x, __m128i y) { | 84 static __m128i abs_i16(__m128i x) { |
85 // One of these two saturated subtractions will be the answer, the other zero. | 85 #if defined(__SSSE3__) |
86 return _mm_or_si128(_mm_subs_epu8(x,y), _mm_subs_epu8(y,x)); | 86 return _mm_abs_epi16(x); |
| 87 #else |
| 88 // Read this all as, return x<0 ? -x : x. |
| 89 // To negate two's complement, you flip all the bits then add 1. |
| 90 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); |
| 91 x = _mm_xor_si128(x, is_negative); // Flip negative lanes. |
| 92 x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative lanes, else +0. |
| 93 return x; |
| 94 #endif |
87 } | 95 } |
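For reference, the #else branch of abs_i16 is the classic two's-complement trick in scalar form. This is an illustrative sketch (abs_i16_scalar is a hypothetical name, not part of the patch); the inputs here are differences of 8-bit values, so the INT16_MIN overflow case never arises:

    #include <stdint.h>

    static int16_t abs_i16_scalar(int16_t x) {
        int16_t mask = x < 0 ? -1 : 0;      // all-ones if negative, like _mm_cmplt_epi16
        x ^= mask;                          // flip all bits of negative values
        return x + ((uint16_t)mask >> 15);  // +1 to negative values, +0 otherwise
    }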
88 | 96 |
89 // Bytewise c ? t : e. | 97 // Bytewise c ? t : e. |
90 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { | 98 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { |
91 // SSE 4.1+ would be: return _mm_blendv_epi8(e,t,c); | 99 #if 0 && defined(__SSE4_1__) // Make sure we have a bot testing this before enabling. |
| 100 return _mm_blendv_epi8(e,t,c); |
| 101 #else |
92 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); | 102 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); |
| 103 #endif |
93 } | 104 } |
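The SSE2 fallback in if_then_else is the standard branch-free select identity (c & t) | (~c & e), which behaves like c ? t : e whenever each lane of c is all-ones or all-zeros, exactly what SSE comparisons produce. A scalar sketch under that assumption (select_u32 is a hypothetical helper for illustration):

    #include <stdint.h>

    static uint32_t select_u32(uint32_t c, uint32_t t, uint32_t e) {
        return (c & t) | (~c & e);  // c ? t : e, with no branch
    }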
94 | 105 |
95 template <int bpp> | 106 template <int bpp> |
96 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 107 void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
97 // Paeth tries to predict pixel d using the pixel to the left of it, a, | 108 // Paeth tries to predict pixel d using the pixel to the left of it, a, |
98 // and two pixels from the previous row, b and c: | 109 // and two pixels from the previous row, b and c: |
99 // prev: c b | 110 // prev: c b |
100 // row: a d | 111 // row: a d |
101 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c. | 112 // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c. |
102 | 113 |
103 // The first pixel has no left context, and so uses an Up filter, p = b. | 114 // The first pixel has no left context, and so uses an Up filter, p = b. |
104 // This works naturally with our main loop's p = a+b-c if we force a and c to zero. | 115 // This works naturally with our main loop's p = a+b-c if we force a and c to zero. |
105 // Here we zero b and d, which become c and a respectively at the start of the loop. | 116 // Here we zero b and d, which become c and a respectively at the start of the loop. |
106 __m128i c, b = _mm_setzero_si128(), | 117 const __m128i zero = _mm_setzero_si128(); |
107 a, d = _mm_setzero_si128(); | 118 __m128i c, b = zero, |
| 119 a, d = zero; |
108 | 120 |
109 int rb = row_info->rowbytes; | 121 int rb = row_info->rowbytes; |
110 while (rb > 0) { | 122 while (rb > 0) { |
111 c = b; b = load<bpp>(prev); | 123 // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates. |
112 a = d; d = load<bpp>(row ); | 124 c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero); |
| 125 a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero); |
113 | 126 |
114 // We can't express p in 8 bits, but luckily we can use this faux p instead. | 127 __m128i pa = _mm_sub_epi16(b,c), // (p-a) == (a+b-c - a) == (b-c) |
115 // (I have no deep insight here... I just proved this with brute force.) | 128 pb = _mm_sub_epi16(a,c), // (p-b) == (a+b-c - b) == (a-c) |
116 __m128i min = _mm_min_epu8(a,b), | 129 pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) |
117 max = _mm_max_epu8(a,b), | |
118 faux_p = _mm_adds_epu8(min, _mm_subs_epu8(max, c)); | |
119 | 130 |
120 // We could use faux_p for calculating all three of pa, pb, and pc, | 131 pa = abs_i16(pa); // |p-a| |
121 // but it's a little quicker to calculate the correct pa and pb directly, | 132 pb = abs_i16(pb); // |p-b| |
122 // and the predictor remains the same. (Again, brute force.) | 133 pc = abs_i16(pc); // |p-c| |
123 __m128i pa = absdiff_u8(b,c), // |a+b-c - a| == |b-c| | |
124 pb = absdiff_u8(a,c), // |a+b-c - b| == |a-c| | |
125 faux_pc = absdiff_u8(faux_p, c); | |
126 | 134 |
127 // From here, things are straightforward. Find the smallest distance to p... | 135 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); |
128 __m128i smallest = _mm_min_epu8(_mm_min_epu8(pa, pb), faux_pc); | |
129 | 136 |
130 // ... then the predictor is the input corresponding to that smallest distance, | 137 // Paeth breaks ties favoring a over b over c. |
131 // breaking ties in favor of a over b over c. | 138 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, |
132 __m128i nearest = if_then_else(_mm_cmpeq_epi8(smallest, pa), a, | 139 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, |
133 if_then_else(_mm_cmpeq_epi8(smallest, pb), b, | 140 c)); |
134 c)); | |
135 | 141 |
136 // We've reconstructed d! Leave it for next round to become a, and write it out. | 142 d = _mm_add_epi8(d, nearest); // Note `_epi8`: we need addition to wrap modulo 256. |
137 d = _mm_add_epi8(d, nearest); | 143 store<bpp>(row, _mm_packus_epi16(d,d)); |
138 store<bpp>(row, d); | |
139 | 144 |
140 prev += bpp; | 145 prev += bpp; |
141 row += bpp; | 146 row += bpp; |
142 rb -= bpp; | 147 rb -= bpp; |
143 } | 148 } |
144 } | 149 } |
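For comparison, here is the scalar Paeth predictor as the PNG specification defines it, with the same tie-breaking order (a, then b, then c) that the vector code reproduces. A reference sketch, not part of the patch:

    #include <stdint.h>
    #include <stdlib.h>

    static uint8_t paeth_predict(uint8_t a, uint8_t b, uint8_t c) {
        int p  = a + b - c;        // initial estimate
        int pa = abs(p - (int)a),  // distance from p to each input
            pb = abs(p - (int)b),
            pc = abs(p - (int)c);
        if (pa <= pb && pa <= pc) return a;
        if (pb <= pc)             return b;
        return c;
    }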
145 | 150 |
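The new 16-bit math relies on a widen/narrow round trip: _mm_unpacklo_epi8 with a zero vector zero-extends each byte to a 16-bit lane, and _mm_packus_epi16 saturates the lanes back down to bytes for the store. A minimal standalone demonstration (assuming only SSE2, like the code above):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const __m128i zero = _mm_setzero_si128();
        uint8_t px[4] = { 0x10, 0x80, 0xff, 0x01 };

        uint32_t bits; memcpy(&bits, px, 4);
        __m128i v = _mm_cvtsi32_si128((int)bits);  // 4 pixel bytes in the low lanes
        __m128i w = _mm_unpacklo_epi8(v, zero);    // widen u8 -> u16
        __m128i n = _mm_packus_epi16(w, w);        // narrow back u16 -> u8, saturating

        uint32_t out = (uint32_t)_mm_cvtsi128_si32(n);
        memcpy(px, &out, 4);
        printf("%02x %02x %02x %02x\n", px[0], px[1], px[2], px[3]);  // 10 80 ff 01
        return 0;
    }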
146 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 151 void sk_sub3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
147 sk_sub_sse2<3>(row_info, row, prev); | 152 sk_sub_sse2<3>(row_info, row, prev); |
148 } | 153 } |
149 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 154 void sk_sub4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
150 sk_sub_sse2<4>(row_info, row, prev); | 155 sk_sub_sse2<4>(row_info, row, prev); |
151 } | 156 } |
152 | 157 |
153 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 158 void sk_avg3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
154 sk_avg_sse2<3>(row_info, row, prev); | 159 sk_avg_sse2<3>(row_info, row, prev); |
155 } | 160 } |
156 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 161 void sk_avg4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
157 sk_avg_sse2<4>(row_info, row, prev); | 162 sk_avg_sse2<4>(row_info, row, prev); |
158 } | 163 } |
159 | 164 |
160 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 165 void sk_paeth3_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
161 sk_paeth_sse2<3>(row_info, row, prev); | 166 sk_paeth_sse2<3>(row_info, row, prev); |
162 } | 167 } |
163 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { | 168 void sk_paeth4_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) { |
164 sk_paeth_sse2<4>(row_info, row, prev); | 169 sk_paeth_sse2<4>(row_info, row, prev); |
165 } | 170 } |
166 | 171 |
167 #endif | 172 #endif |