OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
43 // Convolves horizontally along a single row. The row data is given in | 43 // Convolves horizontally along a single row. The row data is given in |
44 // |src_data| and continues for the num_values() of the filter. | 44 // |src_data| and continues for the num_values() of the filter. |
45 void convolveHorizontally_SSE2(const unsigned char* src_data, | 45 void convolveHorizontally_SSE2(const unsigned char* src_data, |
46 const SkConvolutionFilter1D& filter, | 46 const SkConvolutionFilter1D& filter, |
47 unsigned char* out_row, | 47 unsigned char* out_row, |
48 bool /*has_alpha*/) { | 48 bool /*has_alpha*/) { |
49 int num_values = filter.numValues(); | 49 int num_values = filter.numValues(); |
50 | 50 |
51 int filter_offset, filter_length; | 51 int filter_offset, filter_length; |
52 __m128i zero = _mm_setzero_si128(); | 52 __m128i zero = _mm_setzero_si128(); |
53 __m128i mask[4]; | |
54 // |mask| will be used to decimate all extra filter coefficients that are | |
55 // loaded by SIMD when |filter_length| is not divisible by 4. | |
56 // mask[0] is not used in following algorithm. | |
57 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
58 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
59 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
60 | 53 |
61 // Output one pixel each iteration, calculating all channels (RGBA) together. | 54 // Output one pixel each iteration, calculating all channels (RGBA) together. |
62 for (int out_x = 0; out_x < num_values; out_x++) { | 55 for (int out_x = 0; out_x < num_values; out_x++) { |
63 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 56 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
64 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 57 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
65 | 58 |
66 __m128i accum = _mm_setzero_si128(); | 59 __m128i accum = _mm_setzero_si128(); |
67 | 60 |
68 // Compute the first pixel in this row that the filter affects. It will | 61 // Compute the first pixel in this row that the filter affects. It will |
69 // touch |filter_length| pixels (4 bytes each) after this. | 62 // touch |filter_length| pixels (4 bytes each) after this. |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
113 accum = _mm_add_epi32(accum, t); | 106 accum = _mm_add_epi32(accum, t); |
114 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | 107 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
115 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 108 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
116 accum = _mm_add_epi32(accum, t); | 109 accum = _mm_add_epi32(accum, t); |
117 | 110 |
118 // Advance the pixel and coefficients pointers. | 111 // Advance the pixel and coefficients pointers. |
119 row_to_filter += 1; | 112 row_to_filter += 1; |
120 filter_values += 4; | 113 filter_values += 4; |
121 } | 114 } |
122 | 115 |
123 // When |filter_length| is not divisible by 4, we need to decimate some of | 116 // When |filter_length| is not divisible by 4, we accumulate the last 1 - 3 |
124 // the filter coefficient that was loaded incorrectly to zero; Other than | 117 // coefficients one at a time. |
125 // that the algorithm is same with above, exceot that the 4th pixel will be | 118 int r = filter_length & 3; |
126 // always absent. | |
127 int r = filter_length&3; | |
128 if (r) { | 119 if (r) { |
129 // Note: filter_values must be padded to align_up(filter_offset, 8). | 120 #define ACCUM_REMAINDER(src, accum) { \ |
mtklein_C
2016/11/09 09:25:04
Can we make these ACCUM_REMAINDERs lambdas or stat
xiangze.zhang
2016/11/10 03:16:56
Done.
| |
130 __m128i coeff, coeff16; | 121 int remainder[4] = {0}; \ |
131 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 122 const unsigned char* pixels_left = src + (filter_offset + filter_length - r) * 4; \ |
132 // Mask out extra filter taps. | 123 for (int i = 0; i < r; i++) { \ |
133 coeff = _mm_and_si128(coeff, mask[r]); | 124 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; \ |
134 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 125 remainder[0] += coeff * pixels_left[i * 4 + 0]; \ |
135 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 126 remainder[1] += coeff * pixels_left[i * 4 + 1]; \ |
136 | 127 remainder[2] += coeff * pixels_left[i * 4 + 2]; \ |
137 // Note: line buffer must be padded to align_up(filter_offset, 16). | 128 remainder[3] += coeff * pixels_left[i * 4 + 3]; \ |
138 // We resolve this by use C-version for the last horizontal line. | 129 } \ |
139 __m128i src8 = _mm_loadu_si128(row_to_filter); | 130 __m128i t = _mm_set_epi32(remainder[3], remainder[2], remainder[1], remainder[0]); \ |
mtklein_C
2016/11/09 09:25:04
This is not a big deal, but we have been trending
xiangze.zhang
2016/11/10 03:16:56
Done.
| |
140 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 131 accum = _mm_add_epi32(accum, t); } |
mtklein_C
2016/11/09 09:25:04
If we do need to use macros, it's probably best to
| |
141 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 132 ACCUM_REMAINDER(src_data, accum); |
142 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
143 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
144 accum = _mm_add_epi32(accum, t); | |
145 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
146 accum = _mm_add_epi32(accum, t); | |
147 | |
148 src16 = _mm_unpackhi_epi8(src8, zero); | |
149 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
150 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
151 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
152 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
153 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
154 accum = _mm_add_epi32(accum, t); | |
155 } | 133 } |
156 | 134 |
157 // Shift right for fixed point implementation. | 135 // Shift right for fixed point implementation. |
158 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | 136 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
159 | 137 |
160 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 138 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
161 accum = _mm_packs_epi32(accum, zero); | 139 accum = _mm_packs_epi32(accum, zero); |
162 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 140 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
163 accum = _mm_packus_epi16(accum, zero); | 141 accum = _mm_packus_epi16(accum, zero); |
164 | 142 |
(...skipping 10 matching lines...) Expand all Loading... | |
175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 153 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
176 const SkConvolutionFilter1D& filter, | 154 const SkConvolutionFilter1D& filter, |
177 unsigned char* out_row[4], | 155 unsigned char* out_row[4], |
178 size_t outRowBytes) { | 156 size_t outRowBytes) { |
179 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) | 157 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) |
180 | 158 |
181 int num_values = filter.numValues(); | 159 int num_values = filter.numValues(); |
182 | 160 |
183 int filter_offset, filter_length; | 161 int filter_offset, filter_length; |
184 __m128i zero = _mm_setzero_si128(); | 162 __m128i zero = _mm_setzero_si128(); |
185 __m128i mask[4]; | |
186 // |mask| will be used to decimate all extra filter coefficients that are | |
187 // loaded by SIMD when |filter_length| is not divisible by 4. | |
188 // mask[0] is not used in following algorithm. | |
189 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
190 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
191 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
192 | 163 |
193 // Output one pixel each iteration, calculating all channels (RGBA) together. | 164 // Output one pixel each iteration, calculating all channels (RGBA) together. |
194 for (int out_x = 0; out_x < num_values; out_x++) { | 165 for (int out_x = 0; out_x < num_values; out_x++) { |
195 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 166 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
196 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 167 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
197 | 168 |
198 // four pixels in a column per iteration. | 169 // four pixels in a column per iteration. |
199 __m128i accum0 = _mm_setzero_si128(); | 170 __m128i accum0 = _mm_setzero_si128(); |
200 __m128i accum1 = _mm_setzero_si128(); | 171 __m128i accum1 = _mm_setzero_si128(); |
201 __m128i accum2 = _mm_setzero_si128(); | 172 __m128i accum2 = _mm_setzero_si128(); |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
238 ITERATION(src_data[1] + start, accum1); | 209 ITERATION(src_data[1] + start, accum1); |
239 ITERATION(src_data[2] + start, accum2); | 210 ITERATION(src_data[2] + start, accum2); |
240 ITERATION(src_data[3] + start, accum3); | 211 ITERATION(src_data[3] + start, accum3); |
241 | 212 |
242 start += 16; | 213 start += 16; |
243 filter_values += 4; | 214 filter_values += 4; |
244 } | 215 } |
245 | 216 |
246 int r = filter_length & 3; | 217 int r = filter_length & 3; |
247 if (r) { | 218 if (r) { |
248 // Note: filter_values must be padded to align_up(filter_offset, 8); | 219 ACCUM_REMAINDER(src_data[0], accum0); |
249 __m128i coeff; | 220 ACCUM_REMAINDER(src_data[1], accum1); |
250 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 221 ACCUM_REMAINDER(src_data[2], accum2); |
251 // Mask out extra filter taps. | 222 ACCUM_REMAINDER(src_data[3], accum3); |
252 coeff = _mm_and_si128(coeff, mask[r]); | |
253 | |
254 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
255 /* c1 c1 c1 c1 c0 c0 c0 c0 */ | |
256 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
257 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
258 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
259 | |
260 __m128i src8, src16, mul_hi, mul_lo, t; | |
261 | |
262 ITERATION(src_data[0] + start, accum0); | |
263 ITERATION(src_data[1] + start, accum1); | |
264 ITERATION(src_data[2] + start, accum2); | |
265 ITERATION(src_data[3] + start, accum3); | |
266 } | 223 } |
267 | 224 |
268 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 225 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
269 accum0 = _mm_packs_epi32(accum0, zero); | 226 accum0 = _mm_packs_epi32(accum0, zero); |
270 accum0 = _mm_packus_epi16(accum0, zero); | 227 accum0 = _mm_packus_epi16(accum0, zero); |
271 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 228 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
272 accum1 = _mm_packs_epi32(accum1, zero); | 229 accum1 = _mm_packs_epi32(accum1, zero); |
273 accum1 = _mm_packus_epi16(accum1, zero); | 230 accum1 = _mm_packus_epi16(accum1, zero); |
274 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 231 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
275 accum2 = _mm_packs_epi32(accum2, zero); | 232 accum2 = _mm_packs_epi32(accum2, zero); |
(...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
480 pixel_width, | 437 pixel_width, |
481 out_row); | 438 out_row); |
482 } else { | 439 } else { |
483 convolveVertically_SSE2<false>(filter_values, | 440 convolveVertically_SSE2<false>(filter_values, |
484 filter_length, | 441 filter_length, |
485 source_data_rows, | 442 source_data_rows, |
486 pixel_width, | 443 pixel_width, |
487 out_row); | 444 out_row); |
488 } | 445 } |
489 } | 446 } |
490 | |
491 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | |
492 // Padding |paddingCount| of more dummy coefficients after the coefficients | |
493 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | |
494 // together to access invalid memory areas. We are not trying to align the | |
495 // coefficients right now due to the opaqueness of <vector> implementation. | |
496 // This has to be done after all |AddFilter| calls. | |
497 for (int i = 0; i < 8; ++i) { | |
498 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); | |
499 } | |
500 } | |
OLD | NEW |