OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
| 8 #include <emmintrin.h> |
| 9 #include "SkBitmap.h" |
| 10 #include "SkBitmapFilter_opts_SSE2.h" |
8 #include "SkBitmapProcState.h" | 11 #include "SkBitmapProcState.h" |
9 #include "SkBitmap.h" | |
10 #include "SkColor.h" | 12 #include "SkColor.h" |
11 #include "SkColorPriv.h" | 13 #include "SkColorPriv.h" |
| 14 #include "SkConvolver.h" |
| 15 #include "SkShader.h" |
12 #include "SkUnPreMultiply.h" | 16 #include "SkUnPreMultiply.h" |
13 #include "SkShader.h" | |
14 #include "SkConvolver.h" | |
15 | |
16 #include "SkBitmapFilter_opts_SSE2.h" | |
17 | |
18 #include <emmintrin.h> | |
19 | 17 |
20 #if 0 | 18 #if 0 |
21 static inline void print128i(__m128i value) { | 19 static inline void print128i(__m128i value) { |
22 int *v = (int*) &value; | 20 int *v = (int*) &value; |
23 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); | 21 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); |
24 } | 22 } |
25 | 23 |
26 static inline void print128i_16(__m128i value) { | 24 static inline void print128i_16(__m128i value) { |
27 short *v = (short*) &value; | 25 short *v = (short*) &value; |
28 printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2]
, v[3], v[4], v[5], v[6], v[7]); | 26 printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2]
, v[3], v[4], v[5], v[6], v[7]); |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
168 int r = SkClampMax(int(localResult[1]), a); | 166 int r = SkClampMax(int(localResult[1]), a); |
169 int g = SkClampMax(int(localResult[2]), a); | 167 int g = SkClampMax(int(localResult[2]), a); |
170 int b = SkClampMax(int(localResult[3]), a); | 168 int b = SkClampMax(int(localResult[3]), a); |
171 | 169 |
172 *colors++ = SkPackARGB32(a, r, g, b); | 170 *colors++ = SkPackARGB32(a, r, g, b); |
173 | 171 |
174 x++; | 172 x++; |
175 | 173 |
176 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), | 174 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), |
177 SkIntToScalar(y), &srcPt); | 175 SkIntToScalar(y), &srcPt); |
178 | |
179 } | 176 } |
180 } | 177 } |
181 | 178 |
182 // Convolves horizontally along a single row. The row data is given in | 179 // Convolves horizontally along a single row. The row data is given in |
183 // |src_data| and continues for the num_values() of the filter. | 180 // |src_data| and continues for the num_values() of the filter. |
184 void convolveHorizontally_SSE2(const unsigned char* src_data, | 181 void convolveHorizontally_SSE2(const unsigned char* src_data, |
185 const SkConvolutionFilter1D& filter, | 182 const SkConvolutionFilter1D& filter, |
186 unsigned char* out_row, | 183 unsigned char* out_row, |
187 bool /*has_alpha*/) { | 184 bool /*has_alpha*/) { |
188 int num_values = filter.numValues(); | 185 int num_values = filter.numValues(); |
189 | 186 |
190 int filter_offset, filter_length; | 187 int filter_offset, filter_length; |
191 __m128i zero = _mm_setzero_si128(); | 188 __m128i zero = _mm_setzero_si128(); |
192 __m128i mask[4]; | 189 __m128i mask[4]; |
193 // |mask| will be used to decimate all extra filter coefficients that are | 190 // |mask| will be used to decimate all extra filter coefficients that are |
194 // loaded by SIMD when |filter_length| is not divisible by 4. | 191 // loaded by SIMD when |filter_length| is not divisible by 4. |
195 // mask[0] is not used in following algorithm. | 192 // mask[0] is not used in following algorithm. |
196 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 193 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
197 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 194 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
198 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | 195 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
199 | 196 |
200 // Output one pixel each iteration, calculating all channels (RGBA) together. | 197 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
201 for (int out_x = 0; out_x < num_values; out_x++) { | 198 for (int out_x = 0; out_x < num_values; out_x++) { |
202 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 199 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
203 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 200 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
204 | 201 |
205 __m128i accum = _mm_setzero_si128(); | 202 __m128i accum = _mm_setzero_si128(); |
206 | 203 |
207 // Compute the first pixel in this row that the filter affects. It will | 204 // Compute the first pixel in this row that the filter affects. It will |
208 // touch |filter_length| pixels (4 bytes each) after this. | 205 // touch |filter_length| pixels (4 bytes each) after this. |
209 const __m128i* row_to_filter = | 206 const __m128i* row_to_filter = |
210 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); | 207 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); |
211 | 208 |
212 // We will load and accumulate with four coefficients per iteration. | 209 // We will load and accumulate with four coefficients per iteration. |
213 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { | 210 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { |
214 | 211 |
215 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | 212 // Load 4 coefficients => duplicate 1st and 2nd of them for all chan
nels. |
216 __m128i coeff, coeff16; | 213 __m128i coeff, coeff16; |
217 // [16] xx xx xx xx c3 c2 c1 c0 | 214 // [16] xx xx xx xx c3 c2 c1 c0 |
218 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 215 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
219 // [16] xx xx xx xx c1 c1 c0 c0 | 216 // [16] xx xx xx xx c1 c1 c0 c0 |
220 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 217 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
221 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | 218 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
222 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 219 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
223 | 220 |
224 // Load four pixels => unpack the first two pixels to 16 bits => | 221 // Load four pixels => unpack the first two pixels to 16 bits => |
225 // multiply with coefficients => accumulate the convolution result. | 222 // multiply with coefficients => accumulate the convolution result. |
226 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 223 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
227 __m128i src8 = _mm_loadu_si128(row_to_filter); | 224 __m128i src8 = _mm_loadu_si128(row_to_filter); |
228 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 225 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
229 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 226 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
230 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 227 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
231 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | 228 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
232 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | 229 // [32] a0*c0 b0*c0 g0*c0 r0*c0 |
233 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 230 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
234 accum = _mm_add_epi32(accum, t); | 231 accum = _mm_add_epi32(accum, t); |
235 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | 232 // [32] a1*c1 b1*c1 g1*c1 r1*c1 |
236 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 233 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
237 accum = _mm_add_epi32(accum, t); | 234 accum = _mm_add_epi32(accum, t); |
238 | 235 |
239 // Duplicate 3rd and 4th coefficients for all channels => | 236 // Duplicate 3rd and 4th coefficients for all channels => |
240 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients | 237 // unpack the 3rd and 4th pixels to 16 bits => multiply with coeffic
ients |
241 // => accumulate the convolution results. | 238 // => accumulate the convolution results. |
242 // [16] xx xx xx xx c3 c3 c2 c2 | 239 // [16] xx xx xx xx c3 c3 c2 c2 |
243 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | 240 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
244 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | 241 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
245 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 242 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
246 // [16] a3 g3 b3 r3 a2 g2 b2 r2 | 243 // [16] a3 g3 b3 r3 a2 g2 b2 r2 |
247 src16 = _mm_unpackhi_epi8(src8, zero); | 244 src16 = _mm_unpackhi_epi8(src8, zero); |
248 mul_hi = _mm_mulhi_epi16(src16, coeff16); | 245 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
249 mul_lo = _mm_mullo_epi16(src16, coeff16); | 246 mul_lo = _mm_mullo_epi16(src16, coeff16); |
250 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | 247 // [32] a2*c2 b2*c2 g2*c2 r2*c2 |
251 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 248 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
252 accum = _mm_add_epi32(accum, t); | 249 accum = _mm_add_epi32(accum, t); |
253 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | 250 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
254 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 251 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
255 accum = _mm_add_epi32(accum, t); | 252 accum = _mm_add_epi32(accum, t); |
256 | 253 |
257 // Advance the pixel and coefficients pointers. | 254 // Advance the pixel and coefficients pointers. |
258 row_to_filter += 1; | 255 row_to_filter += 1; |
259 filter_values += 4; | 256 filter_values += 4; |
| 257 } |
| 258 |
| 259 // When |filter_length| is not divisible by 4, we need to decimate some
of |
| 260 // the filter coefficient that was loaded incorrectly to zero; Other tha
n |
 | 261 // that the algorithm is the same as above, except that the 4th pixel
 will be |
| 262 // always absent. |
| 263 int r = filter_length&3; |
| 264 if (r) { |
| 265 // Note: filter_values must be padded to align_up(filter_offset, 8). |
| 266 __m128i coeff, coeff16; |
| 267 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 268 // Mask out extra filter taps. |
| 269 coeff = _mm_and_si128(coeff, mask[r]); |
| 270 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 271 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 272 |
| 273 // Note: line buffer must be padded to align_up(filter_offset, 16). |
 | 274 // We resolve this by using the C-version for the last horizontal line. |
| 275 __m128i src8 = _mm_loadu_si128(row_to_filter); |
| 276 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 277 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 278 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 279 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 280 accum = _mm_add_epi32(accum, t); |
| 281 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 282 accum = _mm_add_epi32(accum, t); |
| 283 |
| 284 src16 = _mm_unpackhi_epi8(src8, zero); |
| 285 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 286 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 287 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 288 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 289 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 290 accum = _mm_add_epi32(accum, t); |
| 291 } |
| 292 |
| 293 // Shift right for fixed point implementation. |
| 294 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 295 |
| 296 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 297 accum = _mm_packs_epi32(accum, zero); |
| 298 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 299 accum = _mm_packus_epi16(accum, zero); |
| 300 |
| 301 // Store the pixel value of 32 bits. |
| 302 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); |
| 303 out_row += 4; |
260 } | 304 } |
261 | |
262 // When |filter_length| is not divisible by 4, we need to decimate some of | |
263 // the filter coefficient that was loaded incorrectly to zero; Other than | |
264 // that the algorithm is the same as above, except that the 4th pixel will be | |
265 // always absent. | |
266 int r = filter_length&3; | |
267 if (r) { | |
268 // Note: filter_values must be padded to align_up(filter_offset, 8). | |
269 __m128i coeff, coeff16; | |
270 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
271 // Mask out extra filter taps. | |
272 coeff = _mm_and_si128(coeff, mask[r]); | |
273 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
274 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
275 | |
276 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
277 // We resolve this by using the C-version for the last horizontal line. | |
278 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
279 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
280 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
281 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
282 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
283 accum = _mm_add_epi32(accum, t); | |
284 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
285 accum = _mm_add_epi32(accum, t); | |
286 | |
287 src16 = _mm_unpackhi_epi8(src8, zero); | |
288 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
289 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
290 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
291 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
292 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
293 accum = _mm_add_epi32(accum, t); | |
294 } | |
295 | |
296 // Shift right for fixed point implementation. | |
297 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | |
298 | |
299 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | |
300 accum = _mm_packs_epi32(accum, zero); | |
301 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | |
302 accum = _mm_packus_epi16(accum, zero); | |
303 | |
304 // Store the pixel value of 32 bits. | |
305 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); | |
306 out_row += 4; | |
307 } | |
308 } | 305 } |
309 | 306 |
310 // Convolves horizontally along four rows. The row data is given in | 307 // Convolves horizontally along four rows. The row data is given in |
311 // |src_data| and continues for the num_values() of the filter. | 308 // |src_data| and continues for the num_values() of the filter. |
312 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please | 309 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please |
313 // refer to that function for detailed comments. | 310 // refer to that function for detailed comments. |
314 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 311 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
315 const SkConvolutionFilter1D& filter, | 312 const SkConvolutionFilter1D& filter, |
316 unsigned char* out_row[4]) { | 313 unsigned char* out_row[4]) { |
317 int num_values = filter.numValues(); | 314 int num_values = filter.numValues(); |
318 | 315 |
319 int filter_offset, filter_length; | 316 int filter_offset, filter_length; |
320 __m128i zero = _mm_setzero_si128(); | 317 __m128i zero = _mm_setzero_si128(); |
321 __m128i mask[4]; | 318 __m128i mask[4]; |
322 // |mask| will be used to decimate all extra filter coefficients that are | 319 // |mask| will be used to decimate all extra filter coefficients that are |
323 // loaded by SIMD when |filter_length| is not divisible by 4. | 320 // loaded by SIMD when |filter_length| is not divisible by 4. |
324 // mask[0] is not used in following algorithm. | 321 // mask[0] is not used in following algorithm. |
325 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 322 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
326 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 323 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
327 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | 324 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
328 | 325 |
329 // Output one pixel each iteration, calculating all channels (RGBA) together. | 326 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
330 for (int out_x = 0; out_x < num_values; out_x++) { | 327 for (int out_x = 0; out_x < num_values; out_x++) { |
331 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 328 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
332 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 329 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
333 | 330 |
334 // four pixels in a column per iteration. | 331 // four pixels in a column per iteration. |
335 __m128i accum0 = _mm_setzero_si128(); | 332 __m128i accum0 = _mm_setzero_si128(); |
336 __m128i accum1 = _mm_setzero_si128(); | 333 __m128i accum1 = _mm_setzero_si128(); |
337 __m128i accum2 = _mm_setzero_si128(); | 334 __m128i accum2 = _mm_setzero_si128(); |
338 __m128i accum3 = _mm_setzero_si128(); | 335 __m128i accum3 = _mm_setzero_si128(); |
339 int start = (filter_offset<<2); | 336 int start = (filter_offset<<2); |
340 // We will load and accumulate with four coefficients per iteration. | 337 // We will load and accumulate with four coefficients per iteration. |
341 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { | 338 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { |
342 __m128i coeff, coeff16lo, coeff16hi; | 339 __m128i coeff, coeff16lo, coeff16hi; |
343 // [16] xx xx xx xx c3 c2 c1 c0 | 340 // [16] xx xx xx xx c3 c2 c1 c0 |
344 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 341 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
345 // [16] xx xx xx xx c1 c1 c0 c0 | 342 // [16] xx xx xx xx c1 c1 c0 c0 |
346 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 343 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
347 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | 344 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
348 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | 345 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
349 // [16] xx xx xx xx c3 c3 c2 c2 | 346 // [16] xx xx xx xx c3 c3 c2 c2 |
350 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | 347 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
351 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | 348 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
352 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | 349 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
353 | 350 |
354 __m128i src8, src16, mul_hi, mul_lo, t; | 351 __m128i src8, src16, mul_hi, mul_lo, t; |
355 | 352 |
356 #define ITERATION(src, accum) \ | 353 #define ITERATION(src, accum) \ |
357 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | 354 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ |
358 src16 = _mm_unpacklo_epi8(src8, zero); \ | 355 src16 = _mm_unpacklo_epi8(src8, zero); \ |
359 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | 356 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ |
360 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | 357 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ |
361 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | 358 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
362 accum = _mm_add_epi32(accum, t); \ | 359 accum = _mm_add_epi32(accum, t); \ |
363 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | 360 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
364 accum = _mm_add_epi32(accum, t); \ | 361 accum = _mm_add_epi32(accum, t); \ |
365 src16 = _mm_unpackhi_epi8(src8, zero); \ | 362 src16 = _mm_unpackhi_epi8(src8, zero); \ |
366 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | 363 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ |
367 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | 364 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ |
368 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | 365 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
369 accum = _mm_add_epi32(accum, t); \ | 366 accum = _mm_add_epi32(accum, t); \ |
370 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | 367 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
371 accum = _mm_add_epi32(accum, t) | 368 accum = _mm_add_epi32(accum, t) |
372 | 369 |
373 ITERATION(src_data[0] + start, accum0); | 370 ITERATION(src_data[0] + start, accum0); |
374 ITERATION(src_data[1] + start, accum1); | 371 ITERATION(src_data[1] + start, accum1); |
375 ITERATION(src_data[2] + start, accum2); | 372 ITERATION(src_data[2] + start, accum2); |
376 ITERATION(src_data[3] + start, accum3); | 373 ITERATION(src_data[3] + start, accum3); |
377 | 374 |
378 start += 16; | 375 start += 16; |
379 filter_values += 4; | 376 filter_values += 4; |
| 377 } |
| 378 |
| 379 int r = filter_length & 3; |
| 380 if (r) { |
| 381 // Note: filter_values must be padded to align_up(filter_offset, 8); |
| 382 __m128i coeff; |
| 383 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 384 // Mask out extra filter taps. |
| 385 coeff = _mm_and_si128(coeff, mask[r]); |
| 386 |
| 387 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0,
0)); |
| 388 /* c1 c1 c1 c1 c0 c0 c0 c0 */ |
| 389 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 390 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2,
2)); |
| 391 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 392 |
| 393 __m128i src8, src16, mul_hi, mul_lo, t; |
| 394 |
| 395 ITERATION(src_data[0] + start, accum0); |
| 396 ITERATION(src_data[1] + start, accum1); |
| 397 ITERATION(src_data[2] + start, accum2); |
| 398 ITERATION(src_data[3] + start, accum3); |
| 399 } |
| 400 |
| 401 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 402 accum0 = _mm_packs_epi32(accum0, zero); |
| 403 accum0 = _mm_packus_epi16(accum0, zero); |
| 404 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 405 accum1 = _mm_packs_epi32(accum1, zero); |
| 406 accum1 = _mm_packus_epi16(accum1, zero); |
| 407 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 408 accum2 = _mm_packs_epi32(accum2, zero); |
| 409 accum2 = _mm_packus_epi16(accum2, zero); |
| 410 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 411 accum3 = _mm_packs_epi32(accum3, zero); |
| 412 accum3 = _mm_packus_epi16(accum3, zero); |
| 413 |
| 414 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); |
| 415 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); |
| 416 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); |
| 417 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); |
| 418 |
| 419 out_row[0] += 4; |
| 420 out_row[1] += 4; |
| 421 out_row[2] += 4; |
| 422 out_row[3] += 4; |
380 } | 423 } |
381 | |
382 int r = filter_length & 3; | |
383 if (r) { | |
384 // Note: filter_values must be padded to align_up(filter_offset, 8); | |
385 __m128i coeff; | |
386 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
387 // Mask out extra filter taps. | |
388 coeff = _mm_and_si128(coeff, mask[r]); | |
389 | |
390 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
391 /* c1 c1 c1 c1 c0 c0 c0 c0 */ | |
392 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
393 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
394 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
395 | |
396 __m128i src8, src16, mul_hi, mul_lo, t; | |
397 | |
398 ITERATION(src_data[0] + start, accum0); | |
399 ITERATION(src_data[1] + start, accum1); | |
400 ITERATION(src_data[2] + start, accum2); | |
401 ITERATION(src_data[3] + start, accum3); | |
402 } | |
403 | |
404 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
405 accum0 = _mm_packs_epi32(accum0, zero); | |
406 accum0 = _mm_packus_epi16(accum0, zero); | |
407 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
408 accum1 = _mm_packs_epi32(accum1, zero); | |
409 accum1 = _mm_packus_epi16(accum1, zero); | |
410 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
411 accum2 = _mm_packs_epi32(accum2, zero); | |
412 accum2 = _mm_packus_epi16(accum2, zero); | |
413 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
414 accum3 = _mm_packs_epi32(accum3, zero); | |
415 accum3 = _mm_packus_epi16(accum3, zero); | |
416 | |
417 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | |
418 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | |
419 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | |
420 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | |
421 | |
422 out_row[0] += 4; | |
423 out_row[1] += 4; | |
424 out_row[2] += 4; | |
425 out_row[3] += 4; | |
426 } | |
427 } | 424 } |
428 | 425 |
429 // Does vertical convolution to produce one output row. The filter values and | 426 // Does vertical convolution to produce one output row. The filter values and |
430 // length are given in the first two parameters. These are applied to each | 427 // length are given in the first two parameters. These are applied to each |
431 // of the rows pointed to in the |source_data_rows| array, with each row | 428 // of the rows pointed to in the |source_data_rows| array, with each row |
432 // being |pixel_width| wide. | 429 // being |pixel_width| wide. |
433 // | 430 // |
434 // The output must have room for |pixel_width * 4| bytes. | 431 // The output must have room for |pixel_width * 4| bytes. |
435 template<bool has_alpha> | 432 template<bool has_alpha> |
436 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, | 433 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, |
437 int filter_length, | 434 int filter_length, |
438 unsigned char* const* source_data_rows, | 435 unsigned char* const* source_data_rows, |
439 int pixel_width, | 436 int pixel_width, |
440 unsigned char* out_row) { | 437 unsigned char* out_row) { |
441 int width = pixel_width & ~3; | 438 int width = pixel_width & ~3; |
442 | 439 |
443 __m128i zero = _mm_setzero_si128(); | 440 __m128i zero = _mm_setzero_si128(); |
444 __m128i accum0, accum1, accum2, accum3, coeff16; | 441 __m128i accum0, accum1, accum2, accum3, coeff16; |
445 const __m128i* src; | 442 const __m128i* src; |
446 // Output four pixels per iteration (16 bytes). | 443 // Output four pixels per iteration (16 bytes). |
447 for (int out_x = 0; out_x < width; out_x += 4) { | 444 for (int out_x = 0; out_x < width; out_x += 4) { |
448 | 445 |
449 // Accumulated result for each pixel. 32 bits per RGBA channel. | 446 // Accumulated result for each pixel. 32 bits per RGBA channel. |
450 accum0 = _mm_setzero_si128(); | 447 accum0 = _mm_setzero_si128(); |
451 accum1 = _mm_setzero_si128(); | 448 accum1 = _mm_setzero_si128(); |
452 accum2 = _mm_setzero_si128(); | 449 accum2 = _mm_setzero_si128(); |
453 accum3 = _mm_setzero_si128(); | 450 accum3 = _mm_setzero_si128(); |
454 | 451 |
455 // Convolve with one filter coefficient per iteration. | 452 // Convolve with one filter coefficient per iteration. |
456 for (int filter_y = 0; filter_y < filter_length; filter_y++) { | 453 for (int filter_y = 0; filter_y < filter_length; filter_y++) { |
457 | 454 |
458 // Duplicate the filter coefficient 8 times. | 455 // Duplicate the filter coefficient 8 times. |
459 // [16] cj cj cj cj cj cj cj cj | 456 // [16] cj cj cj cj cj cj cj cj |
460 coeff16 = _mm_set1_epi16(filter_values[filter_y]); | 457 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
461 | 458 |
462 // Load four pixels (16 bytes) together. | 459 // Load four pixels (16 bytes) together. |
463 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 460 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
464 src = reinterpret_cast<const __m128i*>( | 461 src = reinterpret_cast<const __m128i*>( |
465 &source_data_rows[filter_y][out_x << 2]); | 462 &source_data_rows[filter_y][out_x << 2]); |
466 __m128i src8 = _mm_loadu_si128(src); | 463 __m128i src8 = _mm_loadu_si128(src); |
467 | 464 |
468 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel => | 465 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel
 => |
469 // multiply with current coefficient => accumulate the result. | 466 // multiply with current coefficient => accumulate the result. |
470 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 467 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
471 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 468 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
472 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 469 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
473 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | 470 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
474 // [32] a0 b0 g0 r0 | 471 // [32] a0 b0 g0 r0 |
475 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 472 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
476 accum0 = _mm_add_epi32(accum0, t); | 473 accum0 = _mm_add_epi32(accum0, t); |
477 // [32] a1 b1 g1 r1 | 474 // [32] a1 b1 g1 r1 |
478 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 475 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
479 accum1 = _mm_add_epi32(accum1, t); | 476 accum1 = _mm_add_epi32(accum1, t); |
480 | 477 |
481 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel => | 478 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel
 => |
482 // multiply with current coefficient => accumulate the result. | 479 // multiply with current coefficient => accumulate the result. |
483 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 480 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
484 src16 = _mm_unpackhi_epi8(src8, zero); | 481 src16 = _mm_unpackhi_epi8(src8, zero); |
485 mul_hi = _mm_mulhi_epi16(src16, coeff16); | 482 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
486 mul_lo = _mm_mullo_epi16(src16, coeff16); | 483 mul_lo = _mm_mullo_epi16(src16, coeff16); |
487 // [32] a2 b2 g2 r2 | 484 // [32] a2 b2 g2 r2 |
488 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 485 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
489 accum2 = _mm_add_epi32(accum2, t); | 486 accum2 = _mm_add_epi32(accum2, t); |
490 // [32] a3 b3 g3 r3 | 487 // [32] a3 b3 g3 r3 |
491 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 488 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
492 accum3 = _mm_add_epi32(accum3, t); | 489 accum3 = _mm_add_epi32(accum3, t); |
| 490 } |
| 491 |
| 492 // Shift right for fixed point implementation. |
| 493 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 494 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 495 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 496 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 497 |
| 498 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 499 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 500 accum0 = _mm_packs_epi32(accum0, accum1); |
| 501 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 502 accum2 = _mm_packs_epi32(accum2, accum3); |
| 503 |
| 504 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 505 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 506 accum0 = _mm_packus_epi16(accum0, accum2); |
| 507 |
| 508 if (has_alpha) { |
| 509 // Compute the max(ri, gi, bi) for each pixel. |
| 510 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 511 __m128i a = _mm_srli_epi32(accum0, 8); |
| 512 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 513 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 514 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 515 a = _mm_srli_epi32(accum0, 16); |
| 516 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 517 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 518 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 519 b = _mm_slli_epi32(b, 24); |
| 520 |
| 521 // Make sure the value of alpha channel is always larger than maximu
m |
| 522 // value of color channels. |
| 523 accum0 = _mm_max_epu8(b, accum0); |
| 524 } else { |
| 525 // Set value of alpha channels to 0xFF. |
| 526 __m128i mask = _mm_set1_epi32(0xff000000); |
| 527 accum0 = _mm_or_si128(accum0, mask); |
| 528 } |
| 529 |
| 530 // Store the convolution result (16 bytes) and advance the pixel pointer
s. |
| 531 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); |
| 532 out_row += 16; |
493 } | 533 } |
494 | 534 |
495 // Shift right for fixed point implementation. | 535 // When the width of the output is not divisible by 4, We need to save one |
496 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 536 // pixel (4 bytes) each time. And also the fourth pixel is always absent. |
497 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 537 if (pixel_width & 3) { |
498 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 538 accum0 = _mm_setzero_si128(); |
499 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | 539 accum1 = _mm_setzero_si128(); |
| 540 accum2 = _mm_setzero_si128(); |
| 541 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { |
| 542 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
| 543 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 544 src = reinterpret_cast<const __m128i*>( |
| 545 &source_data_rows[filter_y][width<<2]); |
| 546 __m128i src8 = _mm_loadu_si128(src); |
| 547 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 548 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 549 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 550 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 551 // [32] a0 b0 g0 r0 |
| 552 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 553 accum0 = _mm_add_epi32(accum0, t); |
| 554 // [32] a1 b1 g1 r1 |
| 555 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 556 accum1 = _mm_add_epi32(accum1, t); |
| 557 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 558 src16 = _mm_unpackhi_epi8(src8, zero); |
| 559 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 560 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 561 // [32] a2 b2 g2 r2 |
| 562 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 563 accum2 = _mm_add_epi32(accum2, t); |
| 564 } |
500 | 565 |
501 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 566 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
502 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 567 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
503 accum0 = _mm_packs_epi32(accum0, accum1); | 568 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
504 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 569 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
505 accum2 = _mm_packs_epi32(accum2, accum3); | 570 accum0 = _mm_packs_epi32(accum0, accum1); |
| 571 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 572 accum2 = _mm_packs_epi32(accum2, zero); |
| 573 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 574 accum0 = _mm_packus_epi16(accum0, accum2); |
| 575 if (has_alpha) { |
| 576 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 577 __m128i a = _mm_srli_epi32(accum0, 8); |
| 578 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 579 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 580 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 581 a = _mm_srli_epi32(accum0, 16); |
| 582 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 583 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 584 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 585 b = _mm_slli_epi32(b, 24); |
| 586 accum0 = _mm_max_epu8(b, accum0); |
| 587 } else { |
| 588 __m128i mask = _mm_set1_epi32(0xff000000); |
| 589 accum0 = _mm_or_si128(accum0, mask); |
| 590 } |
506 | 591 |
507 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 592 for (int out_x = width; out_x < pixel_width; out_x++) { |
508 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 593 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); |
509 accum0 = _mm_packus_epi16(accum0, accum2); | 594 accum0 = _mm_srli_si128(accum0, 4); |
510 | 595 out_row += 4; |
511 if (has_alpha) { | 596 } |
512 // Compute the max(ri, gi, bi) for each pixel. | |
513 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
514 __m128i a = _mm_srli_epi32(accum0, 8); | |
515 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
516 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
517 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
518 a = _mm_srli_epi32(accum0, 16); | |
519 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
520 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
521 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
522 b = _mm_slli_epi32(b, 24); | |
523 | |
524 // Make sure the value of alpha channel is always larger than maximum | |
525 // value of color channels. | |
526 accum0 = _mm_max_epu8(b, accum0); | |
527 } else { | |
528 // Set value of alpha channels to 0xFF. | |
529 __m128i mask = _mm_set1_epi32(0xff000000); | |
530 accum0 = _mm_or_si128(accum0, mask); | |
531 } | 597 } |
532 | |
533 // Store the convolution result (16 bytes) and advance the pixel pointers. | |
534 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); | |
535 out_row += 16; | |
536 } | |
537 | |
538 // When the width of the output is not divisible by 4, We need to save one | |
539 // pixel (4 bytes) each time. And also the fourth pixel is always absent. | |
540 if (pixel_width & 3) { | |
541 accum0 = _mm_setzero_si128(); | |
542 accum1 = _mm_setzero_si128(); | |
543 accum2 = _mm_setzero_si128(); | |
544 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { | |
545 coeff16 = _mm_set1_epi16(filter_values[filter_y]); | |
546 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
547 src = reinterpret_cast<const __m128i*>( | |
548 &source_data_rows[filter_y][width<<2]); | |
549 __m128i src8 = _mm_loadu_si128(src); | |
550 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
551 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
552 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
553 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
554 // [32] a0 b0 g0 r0 | |
555 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
556 accum0 = _mm_add_epi32(accum0, t); | |
557 // [32] a1 b1 g1 r1 | |
558 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
559 accum1 = _mm_add_epi32(accum1, t); | |
560 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
561 src16 = _mm_unpackhi_epi8(src8, zero); | |
562 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
563 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
564 // [32] a2 b2 g2 r2 | |
565 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
566 accum2 = _mm_add_epi32(accum2, t); | |
567 } | |
568 | |
569 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
570 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
571 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
572 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
573 accum0 = _mm_packs_epi32(accum0, accum1); | |
574 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
575 accum2 = _mm_packs_epi32(accum2, zero); | |
576 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
577 accum0 = _mm_packus_epi16(accum0, accum2); | |
578 if (has_alpha) { | |
579 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
580 __m128i a = _mm_srli_epi32(accum0, 8); | |
581 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
582 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
583 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
584 a = _mm_srli_epi32(accum0, 16); | |
585 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
586 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
587 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
588 b = _mm_slli_epi32(b, 24); | |
589 accum0 = _mm_max_epu8(b, accum0); | |
590 } else { | |
591 __m128i mask = _mm_set1_epi32(0xff000000); | |
592 accum0 = _mm_or_si128(accum0, mask); | |
593 } | |
594 | |
595 for (int out_x = width; out_x < pixel_width; out_x++) { | |
596 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); | |
597 accum0 = _mm_srli_si128(accum0, 4); | |
598 out_row += 4; | |
599 } | |
600 } | |
601 } | 598 } |
602 | 599 |
603 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, | 600 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, |
604 int filter_length, | 601 int filter_length, |
605 unsigned char* const* source_data_rows, | 602 unsigned char* const* source_data_rows, |
606 int pixel_width, | 603 int pixel_width, |
607 unsigned char* out_row, | 604 unsigned char* out_row, |
608 bool has_alpha) { | 605 bool has_alpha) { |
609 if (has_alpha) { | 606 if (has_alpha) { |
610 convolveVertically_SSE2<true>(filter_values, | 607 convolveVertically_SSE2<true>(filter_values, |
611 filter_length, | 608 filter_length, |
612 source_data_rows, | 609 source_data_rows, |
613 pixel_width, | 610 pixel_width, |
614 out_row); | 611 out_row); |
615 } else { | 612 } else { |
616 convolveVertically_SSE2<false>(filter_values, | 613 convolveVertically_SSE2<false>(filter_values, |
617 filter_length, | 614 filter_length, |
618 source_data_rows, | 615 source_data_rows, |
619 pixel_width, | 616 pixel_width, |
620 out_row); | 617 out_row); |
621 } | 618 } |
622 } | 619 } |
623 | 620 |
624 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 621 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
625 // Padding |paddingCount| of more dummy coefficients after the coefficients | 622 // Padding |paddingCount| of more dummy coefficients after the coefficients |
626 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 623 // of last filter to prevent SIMD instructions which load 8 or 16 bytes |
627 // together to access invalid memory areas. We are not trying to align the | 624 // together to access invalid memory areas. We are not trying to align the |
628 // coefficients right now due to the opaqueness of <vector> implementation. | 625 // coefficients right now due to the opaqueness of <vector> implementation. |
629 // This has to be done after all |AddFilter| calls. | 626 // This has to be done after all |AddFilter| calls. |
630 for (int i = 0; i < 8; ++i) { | 627 for (int i = 0; i < 8; ++i) { |
631 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | 628 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); |
632 } | 629 } |
633 } | 630 } |
OLD | NEW |