Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(231)

Side by Side Diff: skia/ext/convolver.cc

Issue 6334070: SIMD implementation of Convolver for Lanczos filter etc. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: brett's review Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « skia/ext/convolver.h ('k') | skia/ext/convolver_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <algorithm> 5 #include <algorithm>
6 6
7 #include "skia/ext/convolver.h" 7 #include "skia/ext/convolver.h"
8 #include "third_party/skia/include/core/SkTypes.h" 8 #include "third_party/skia/include/core/SkTypes.h"
9 9
10 #if defined(ARCH_CPU_X86_FAMILY)
11 #if defined(OS_WIN) || defined(__SSE2__)
12 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h
fbarchard 2011/03/07 20:38:10 This header can't be used unless you enable SSE2.
jiesun 2011/03/07 20:55:24 my understanding is that on windows this is always
13 #endif
14 #endif
15
10 namespace skia { 16 namespace skia {
11 17
12 namespace { 18 namespace {
13 19
14 // Converts the argument to an 8-bit unsigned value by clamping to the range 20 // Converts the argument to an 8-bit unsigned value by clamping to the range
15 // 0-255. 21 // 0-255.
16 inline unsigned char ClampTo8(int a) { 22 inline unsigned char ClampTo8(int a) {
17 if (static_cast<unsigned>(a) < 256) 23 if (static_cast<unsigned>(a) < 256)
18 return a; // Avoid the extra check in the common case. 24 return a; // Avoid the extra check in the common case.
19 if (a < 0) 25 if (a < 0)
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
192 if (has_alpha) 198 if (has_alpha)
193 accum[3] >>= ConvolutionFilter1D::kShiftBits; 199 accum[3] >>= ConvolutionFilter1D::kShiftBits;
194 200
195 // Store the new pixel. 201 // Store the new pixel.
196 out_row[byte_offset + 0] = ClampTo8(accum[0]); 202 out_row[byte_offset + 0] = ClampTo8(accum[0]);
197 out_row[byte_offset + 1] = ClampTo8(accum[1]); 203 out_row[byte_offset + 1] = ClampTo8(accum[1]);
198 out_row[byte_offset + 2] = ClampTo8(accum[2]); 204 out_row[byte_offset + 2] = ClampTo8(accum[2]);
199 if (has_alpha) { 205 if (has_alpha) {
200 unsigned char alpha = ClampTo8(accum[3]); 206 unsigned char alpha = ClampTo8(accum[3]);
201 207
202 // Make sure the alpha channel doesn't come out larger than any of the 208 // Make sure the alpha channel doesn't come out smaller than any of the
203 // color channels. We use premultipled alpha channels, so this should 209 // color channels. We use premultipled alpha channels, so this should
204 // never happen, but rounding errors will cause this from time to time. 210 // never happen, but rounding errors will cause this from time to time.
205 // These "impossible" colors will cause overflows (and hence random pixel 211 // These "impossible" colors will cause overflows (and hence random pixel
206 // values) when the resulting bitmap is drawn to the screen. 212 // values) when the resulting bitmap is drawn to the screen.
207 // 213 //
208 // We only need to do this when generating the final output row (here). 214 // We only need to do this when generating the final output row (here).
209 int max_color_channel = std::max(out_row[byte_offset + 0], 215 int max_color_channel = std::max(out_row[byte_offset + 0],
210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); 216 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));
211 if (alpha < max_color_channel) 217 if (alpha < max_color_channel)
212 out_row[byte_offset + 3] = max_color_channel; 218 out_row[byte_offset + 3] = max_color_channel;
213 else 219 else
214 out_row[byte_offset + 3] = alpha; 220 out_row[byte_offset + 3] = alpha;
215 } else { 221 } else {
216 // No alpha channel, the image is opaque. 222 // No alpha channel, the image is opaque.
217 out_row[byte_offset + 3] = 0xff; 223 out_row[byte_offset + 3] = 0xff;
218 } 224 }
219 } 225 }
220 } 226 }
221 227
228
229 // Convolves horizontally along a single row. The row data is given in
230 // |src_data| and continues for the num_values() of the filter.
231 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
232 const ConvolutionFilter1D& filter,
233 unsigned char* out_row) {
234 #if defined(ARCH_CPU_X86_FAMILY)
235 #if defined(OS_WIN) || defined(__SSE2__)
236 int num_values = filter.num_values();
237
238 int filter_offset, filter_length;
239 __m128i zero = _mm_setzero_si128();
240 __m128i mask[4];
241 // |mask| will be used to decimate all extra filter coefficients that are
242 // loaded by SIMD when |filter_length| is not divisible by 4.
243 // mask[0] is not used in following algorithm.
244 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
245 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
246 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
247
248 // Output one pixel each iteration, calculating all channels (RGBA) together.
249 for (int out_x = 0; out_x < num_values; out_x++) {
250 const ConvolutionFilter1D::Fixed* filter_values =
251 filter.FilterForValue(out_x, &filter_offset, &filter_length);
252
253 __m128i accum = _mm_setzero_si128();
254
255 // Compute the first pixel in this row that the filter affects. It will
256 // touch |filter_length| pixels (4 bytes each) after this.
257 const __m128i* row_to_filter =
258 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
259
260 // We will load and accumulate with four coefficients per iteration.
261 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
262
263 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
264 __m128i coeff, coeff16;
265 // [16] xx xx xx xx c3 c2 c1 c0
266 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
267 // [16] xx xx xx xx c1 c1 c0 c0
268 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
269 // [16] c1 c1 c1 c1 c0 c0 c0 c0
270 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
271
272 // Load four pixels => unpack the first two pixels to 16 bits =>
273 // multiply with coefficients => accumulate the convolution result.
274 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
275 __m128i src8 = _mm_loadu_si128(row_to_filter);
276 // [16] a1 b1 g1 r1 a0 b0 g0 r0
277 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
278 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
279 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
280 // [32] a0*c0 b0*c0 g0*c0 r0*c0
281 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
282 accum = _mm_add_epi32(accum, t);
283 // [32] a1*c1 b1*c1 g1*c1 r1*c1
284 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
285 accum = _mm_add_epi32(accum, t);
286
287 // Duplicate 3rd and 4th coefficients for all channels =>
288 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
289 // => accumulate the convolution results.
290 // [16] xx xx xx xx c3 c3 c2 c2
291 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
292 // [16] c3 c3 c3 c3 c2 c2 c2 c2
293 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
294 // [16] a3 b3 g3 r3 a2 b2 g2 r2
295 src16 = _mm_unpackhi_epi8(src8, zero);
296 mul_hi = _mm_mulhi_epi16(src16, coeff16);
297 mul_lo = _mm_mullo_epi16(src16, coeff16);
298 // [32] a2*c2 b2*c2 g2*c2 r2*c2
299 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
300 accum = _mm_add_epi32(accum, t);
301 // [32] a3*c3 b3*c3 g3*c3 r3*c3
302 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
303 accum = _mm_add_epi32(accum, t);
304
305 // Advance the pixel and coefficients pointers.
306 row_to_filter += 1;
307 filter_values += 4;
308 }
309
310 // When |filter_length| is not divisible by 4, we need to zero out the extra
311 // filter coefficients that were loaded incorrectly. Other than that, the
312 // algorithm is the same as above, except that the 4th pixel will always be
313 // absent.
314 int r = filter_length&3;
315 if (r) {
316 // Note: filter_values must be padded to align_up(filter_offset, 8).
317 __m128i coeff, coeff16;
318 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
319 // Mask out extra filter taps.
320 coeff = _mm_and_si128(coeff, mask[r]);
321 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
322 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
323
324 // Note: line buffer must be padded to align_up(filter_offset, 16).
325 // We resolve this by using the C version for the last horizontal line.
326 __m128i src8 = _mm_loadu_si128(row_to_filter);
327 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
328 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
329 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
330 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
331 accum = _mm_add_epi32(accum, t);
332 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
333 accum = _mm_add_epi32(accum, t);
334
335 src16 = _mm_unpackhi_epi8(src8, zero);
336 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
337 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
338 mul_hi = _mm_mulhi_epi16(src16, coeff16);
339 mul_lo = _mm_mullo_epi16(src16, coeff16);
340 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
341 accum = _mm_add_epi32(accum, t);
342 }
343
344 // Shift right for fixed point implementation.
345 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
346
347 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
348 accum = _mm_packs_epi32(accum, zero);
349 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
350 accum = _mm_packus_epi16(accum, zero);
351
352 // Store the pixel value of 32 bits.
353 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
354 out_row += 4;
355 }
356 #endif
357 #endif
358 }
359
360 // Convolves horizontally along four rows. The row data is given in
361 // |src_data| and continues for the num_values() of the filter.
362 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please
363 // refer to that function for detailed comments.
364 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
365 const ConvolutionFilter1D& filter,
366 unsigned char* out_row[4]) {
367 #if defined(ARCH_CPU_X86_FAMILY)
368 #if defined(OS_WIN) || defined(__SSE2__)
369 int num_values = filter.num_values();
370
371 int filter_offset, filter_length;
372 __m128i zero = _mm_setzero_si128();
373 __m128i mask[4];
374 // |mask| will be used to decimate all extra filter coefficients that are
375 // loaded by SIMD when |filter_length| is not divisible by 4.
376 // mask[0] is not used in following algorithm.
377 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
378 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
379 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
380
381 // Output one pixel each iteration, calculating all channels (RGBA) together.
382 for (int out_x = 0; out_x < num_values; out_x++) {
383 const ConvolutionFilter1D::Fixed* filter_values =
384 filter.FilterForValue(out_x, &filter_offset, &filter_length);
385
386 // four pixels in a column per iteration.
387 __m128i accum0 = _mm_setzero_si128();
388 __m128i accum1 = _mm_setzero_si128();
389 __m128i accum2 = _mm_setzero_si128();
390 __m128i accum3 = _mm_setzero_si128();
391 int start = (filter_offset<<2);
392 // We will load and accumulate with four coefficients per iteration.
393 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
394 __m128i coeff, coeff16lo, coeff16hi;
395 // [16] xx xx xx xx c3 c2 c1 c0
396 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
397 // [16] xx xx xx xx c1 c1 c0 c0
398 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
399 // [16] c1 c1 c1 c1 c0 c0 c0 c0
400 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
401 // [16] xx xx xx xx c3 c3 c2 c2
402 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
403 // [16] c3 c3 c3 c3 c2 c2 c2 c2
404 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
405
406 __m128i src8, src16, mul_hi, mul_lo, t;
407
408 #define ITERATION(src, accum) \
409 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
410 src16 = _mm_unpacklo_epi8(src8, zero); \
411 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
412 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
413 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
414 accum = _mm_add_epi32(accum, t); \
415 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
416 accum = _mm_add_epi32(accum, t); \
417 src16 = _mm_unpackhi_epi8(src8, zero); \
418 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
419 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
420 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
421 accum = _mm_add_epi32(accum, t); \
422 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
423 accum = _mm_add_epi32(accum, t)
424
425 ITERATION(src_data[0] + start, accum0);
426 ITERATION(src_data[1] + start, accum1);
427 ITERATION(src_data[2] + start, accum2);
428 ITERATION(src_data[3] + start, accum3);
429
430 start += 16;
431 filter_values += 4;
432 }
433
434 int r = filter_length & 3;
435 if (r) {
436 // Note: filter_values must be padded to align_up(filter_offset, 8);
437 __m128i coeff;
438 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
439 // Mask out extra filter taps.
440 coeff = _mm_and_si128(coeff, mask[r]);
441
442 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
443 /* c1 c1 c1 c1 c0 c0 c0 c0 */
444 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
445 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
446 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
447
448 __m128i src8, src16, mul_hi, mul_lo, t;
449
450 ITERATION(src_data[0] + start, accum0);
451 ITERATION(src_data[1] + start, accum1);
452 ITERATION(src_data[2] + start, accum2);
453 ITERATION(src_data[3] + start, accum3);
454 }
455
456 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
457 accum0 = _mm_packs_epi32(accum0, zero);
458 accum0 = _mm_packus_epi16(accum0, zero);
459 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
460 accum1 = _mm_packs_epi32(accum1, zero);
461 accum1 = _mm_packus_epi16(accum1, zero);
462 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
463 accum2 = _mm_packs_epi32(accum2, zero);
464 accum2 = _mm_packus_epi16(accum2, zero);
465 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
466 accum3 = _mm_packs_epi32(accum3, zero);
467 accum3 = _mm_packus_epi16(accum3, zero);
468
469 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
470 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
471 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
472 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
473
474 out_row[0] += 4;
475 out_row[1] += 4;
476 out_row[2] += 4;
477 out_row[3] += 4;
478 }
479 #endif
480 #endif
481 }
482
483 // Does vertical convolution to produce one output row. The filter values and
484 // length are given in the first two parameters. These are applied to each
485 // of the rows pointed to in the |source_data_rows| array, with each row
486 // being |pixel_width| wide.
487 //
488 // The output must have room for |pixel_width * 4| bytes.
489 template<bool has_alpha>
490 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
491 int filter_length,
492 unsigned char* const* source_data_rows,
493 int pixel_width,
494 unsigned char* out_row) {
495 #if defined(ARCH_CPU_X86_FAMILY)
496 #if defined(OS_WIN) || defined(__SSE2__)
497 int width = pixel_width & ~3;
498
499 __m128i zero = _mm_setzero_si128();
500 __m128i accum0, accum1, accum2, accum3, coeff16;
501 const __m128i* src;
502 // Output four pixels per iteration (16 bytes).
503 for (int out_x = 0; out_x < width; out_x += 4) {
504
505 // Accumulated result for each pixel. 32 bits per RGBA channel.
506 accum0 = _mm_setzero_si128();
507 accum1 = _mm_setzero_si128();
508 accum2 = _mm_setzero_si128();
509 accum3 = _mm_setzero_si128();
510
511 // Convolve with one filter coefficient per iteration.
512 for (int filter_y = 0; filter_y < filter_length; filter_y++) {
513
514 // Duplicate the filter coefficient 8 times.
515 // [16] cj cj cj cj cj cj cj cj
516 coeff16 = _mm_set1_epi16(filter_values[filter_y]);
517
518 // Load four pixels (16 bytes) together.
519 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
520 src = reinterpret_cast<const __m128i*>(
521 &source_data_rows[filter_y][out_x << 2]);
522 __m128i src8 = _mm_loadu_si128(src);
523
524 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
525 // multiply with current coefficient => accumulate the result.
526 // [16] a1 b1 g1 r1 a0 b0 g0 r0
527 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
528 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
529 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
530 // [32] a0 b0 g0 r0
531 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
532 accum0 = _mm_add_epi32(accum0, t);
533 // [32] a1 b1 g1 r1
534 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
535 accum1 = _mm_add_epi32(accum1, t);
536
537 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
538 // multiply with current coefficient => accumulate the result.
539 // [16] a3 b3 g3 r3 a2 b2 g2 r2
540 src16 = _mm_unpackhi_epi8(src8, zero);
541 mul_hi = _mm_mulhi_epi16(src16, coeff16);
542 mul_lo = _mm_mullo_epi16(src16, coeff16);
543 // [32] a2 b2 g2 r2
544 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
545 accum2 = _mm_add_epi32(accum2, t);
546 // [32] a3 b3 g3 r3
547 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
548 accum3 = _mm_add_epi32(accum3, t);
549 }
550
551 // Shift right for fixed point implementation.
552 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
553 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
554 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
555 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
556
557 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
558 // [16] a1 b1 g1 r1 a0 b0 g0 r0
559 accum0 = _mm_packs_epi32(accum0, accum1);
560 // [16] a3 b3 g3 r3 a2 b2 g2 r2
561 accum2 = _mm_packs_epi32(accum2, accum3);
562
563 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
564 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
565 accum0 = _mm_packus_epi16(accum0, accum2);
566
567 if (has_alpha) {
568 // Compute the max(ri, gi, bi) for each pixel.
569 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
570 __m128i a = _mm_srli_epi32(accum0, 8);
571 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
572 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
573 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
574 a = _mm_srli_epi32(accum0, 16);
575 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
576 b = _mm_max_epu8(a, b); // Max of r and g and b.
577 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
578 b = _mm_slli_epi32(b, 24);
579
580 // Make sure the value of the alpha channel is never smaller than the
581 // maximum value of the color channels.
582 accum0 = _mm_max_epu8(b, accum0);
583 } else {
584 // Set value of alpha channels to 0xFF.
585 __m128i mask = _mm_set1_epi32(0xff000000);
586 accum0 = _mm_or_si128(accum0, mask);
587 }
588
589 // Store the convolution result (16 bytes) and advance the pixel pointers.
590 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
591 out_row += 16;
592 }
593
594 // When the width of the output is not divisible by 4, we need to save one
595 // pixel (4 bytes) each time. Also, the fourth pixel is always absent.
596 if (pixel_width & 3) {
597 accum0 = _mm_setzero_si128();
598 accum1 = _mm_setzero_si128();
599 accum2 = _mm_setzero_si128();
600 for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
601 coeff16 = _mm_set1_epi16(filter_values[filter_y]);
602 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
603 src = reinterpret_cast<const __m128i*>(
604 &source_data_rows[filter_y][width<<2]);
605 __m128i src8 = _mm_loadu_si128(src);
606 // [16] a1 b1 g1 r1 a0 b0 g0 r0
607 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
608 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
609 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
610 // [32] a0 b0 g0 r0
611 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
612 accum0 = _mm_add_epi32(accum0, t);
613 // [32] a1 b1 g1 r1
614 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
615 accum1 = _mm_add_epi32(accum1, t);
616 // [16] a3 b3 g3 r3 a2 b2 g2 r2
617 src16 = _mm_unpackhi_epi8(src8, zero);
618 mul_hi = _mm_mulhi_epi16(src16, coeff16);
619 mul_lo = _mm_mullo_epi16(src16, coeff16);
620 // [32] a2 b2 g2 r2
621 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
622 accum2 = _mm_add_epi32(accum2, t);
623 }
624
625 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
626 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
627 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
628 // [16] a1 b1 g1 r1 a0 b0 g0 r0
629 accum0 = _mm_packs_epi32(accum0, accum1);
630 // [16] a3 b3 g3 r3 a2 b2 g2 r2
631 accum2 = _mm_packs_epi32(accum2, zero);
632 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
633 accum0 = _mm_packus_epi16(accum0, accum2);
634 if (has_alpha) {
635 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
636 __m128i a = _mm_srli_epi32(accum0, 8);
637 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
638 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
639 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
640 a = _mm_srli_epi32(accum0, 16);
641 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
642 b = _mm_max_epu8(a, b); // Max of r and g and b.
643 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
644 b = _mm_slli_epi32(b, 24);
645 accum0 = _mm_max_epu8(b, accum0);
646 } else {
647 __m128i mask = _mm_set1_epi32(0xff000000);
648 accum0 = _mm_or_si128(accum0, mask);
649 }
650
651 for (int out_x = width; out_x < pixel_width; out_x++) {
652 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
653 accum0 = _mm_srli_si128(accum0, 4);
654 out_row += 4;
655 }
656 }
657 #endif
658 #endif
659 }
660
222 } // namespace 661 } // namespace
223 662
224 // ConvolutionFilter1D --------------------------------------------------------- 663 // ConvolutionFilter1D ---------------------------------------------------------
225 664
226 ConvolutionFilter1D::ConvolutionFilter1D() 665 ConvolutionFilter1D::ConvolutionFilter1D()
227 : max_filter_(0) { 666 : max_filter_(0) {
228 } 667 }
229 668
230 ConvolutionFilter1D::~ConvolutionFilter1D() { 669 ConvolutionFilter1D::~ConvolutionFilter1D() {
231 } 670 }
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
277 // We pushed filter_length elements onto filter_values_ 716 // We pushed filter_length elements onto filter_values_
278 instance.data_location = (static_cast<int>(filter_values_.size()) - 717 instance.data_location = (static_cast<int>(filter_values_.size()) -
279 filter_length); 718 filter_length);
280 instance.offset = filter_offset; 719 instance.offset = filter_offset;
281 instance.length = filter_length; 720 instance.length = filter_length;
282 filters_.push_back(instance); 721 filters_.push_back(instance);
283 722
284 max_filter_ = std::max(max_filter_, filter_length); 723 max_filter_ = std::max(max_filter_, filter_length);
285 } 724 }
286 725
287 // BGRAConvolve2D -------------------------------------------------------------
288
289 void BGRAConvolve2D(const unsigned char* source_data, 726 void BGRAConvolve2D(const unsigned char* source_data,
290 int source_byte_row_stride, 727 int source_byte_row_stride,
291 bool source_has_alpha, 728 bool source_has_alpha,
292 const ConvolutionFilter1D& filter_x, 729 const ConvolutionFilter1D& filter_x,
293 const ConvolutionFilter1D& filter_y, 730 const ConvolutionFilter1D& filter_y,
294 int output_byte_row_stride, 731 int output_byte_row_stride,
295 unsigned char* output) { 732 unsigned char* output,
733 bool use_sse2) {
734 #if defined(ARCH_CPU_X86_FAMILY)
735 #if !defined(OS_WIN) && !defined(__SSE2__)
736 // Even if we have runtime support for SSE2 instructions, the binary was
737 // not built with SSE2 support, so we have to fall back to the C version.
738 use_sse2 = false;
739 #endif
740 #endif
741
296 int max_y_filter_size = filter_y.max_filter(); 742 int max_y_filter_size = filter_y.max_filter();
297 743
298 // The next row in the input that we will generate a horizontally 744 // The next row in the input that we will generate a horizontally
299 // convolved row for. If the filter doesn't start at the beginning of the 745 // convolved row for. If the filter doesn't start at the beginning of the
300 // image (this is the case when we are only resizing a subset), then we 746 // image (this is the case when we are only resizing a subset), then we
301 // don't want to generate any output rows before that. Compute the starting 747 // don't want to generate any output rows before that. Compute the starting
302 // row for convolution as the first pixel for the first vertical filter. 748 // row for convolution as the first pixel for the first vertical filter.
303 int filter_offset, filter_length; 749 int filter_offset, filter_length;
304 const ConvolutionFilter1D::Fixed* filter_values = 750 const ConvolutionFilter1D::Fixed* filter_values =
305 filter_y.FilterForValue(0, &filter_offset, &filter_length); 751 filter_y.FilterForValue(0, &filter_offset, &filter_length);
306 int next_x_row = filter_offset; 752 int next_x_row = filter_offset;
307 753
308 // We loop over each row in the input doing a horizontal convolution. This 754 // We loop over each row in the input doing a horizontal convolution. This
309 // will result in a horizontally convolved image. We write the results into 755 // will result in a horizontally convolved image. We write the results into
310 // a circular buffer of convolved rows and do vertical convolution as rows 756 // a circular buffer of convolved rows and do vertical convolution as rows
311 // are available. This prevents us from having to store the entire 757 // are available. This prevents us from having to store the entire
312 // intermediate image and helps cache coherency. 758 // intermediate image and helps cache coherency.
313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, 759 // We will need four extra rows so that horizontal convolution can be done
760 // on four rows simultaneously. We also pad each row in the row buffer to be
761 // aligned up to 16 bytes.
762 // TODO(jiesun): We do not use aligned load from row buffer in vertical
763 // convolution pass yet. Somehow Windows does not like it.
764 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
765 int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
766 CircularRowBuffer row_buffer(row_buffer_width,
767 row_buffer_height,
314 filter_offset); 768 filter_offset);
315 769
316 // Loop over every possible output row, processing just enough horizontal 770 // Loop over every possible output row, processing just enough horizontal
317 // convolutions to run each subsequent vertical convolution. 771 // convolutions to run each subsequent vertical convolution.
318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); 772 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
319 int num_output_rows = filter_y.num_values(); 773 int num_output_rows = filter_y.num_values();
774
775 // We need to check which is the last line to convolve before we advance 4
776 // lines in one iteration.
777 int last_filter_offset, last_filter_length;
778 filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
779 &last_filter_length);
780
320 for (int out_y = 0; out_y < num_output_rows; out_y++) { 781 for (int out_y = 0; out_y < num_output_rows; out_y++) {
321 filter_values = filter_y.FilterForValue(out_y, 782 filter_values = filter_y.FilterForValue(out_y,
322 &filter_offset, &filter_length); 783 &filter_offset, &filter_length);
323 784
324 // Generate output rows until we have enough to run the current filter. 785 // Generate output rows until we have enough to run the current filter.
325 while (next_x_row < filter_offset + filter_length) { 786 if (use_sse2) {
326 if (source_has_alpha) { 787 while (next_x_row < filter_offset + filter_length) {
327 ConvolveHorizontally<true>( 788 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
328 &source_data[next_x_row * source_byte_row_stride], 789 const unsigned char* src[4];
329 filter_x, row_buffer.AdvanceRow()); 790 unsigned char* out_row[4];
330 } else { 791 for (int i = 0; i < 4; ++i) {
331 ConvolveHorizontally<false>( 792 src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
332 &source_data[next_x_row * source_byte_row_stride], 793 out_row[i] = row_buffer.AdvanceRow();
333 filter_x, row_buffer.AdvanceRow()); 794 }
795 ConvolveHorizontally4_SSE2(src, filter_x, out_row);
796 next_x_row += 4;
797 } else {
798 // For the last row, an SSE2 load may access data beyond the image
799 // area; therefore we use the C version here.
800 if (next_x_row == last_filter_offset + last_filter_length - 1) {
801 if (source_has_alpha) {
802 ConvolveHorizontally<true>(
803 &source_data[next_x_row * source_byte_row_stride],
804 filter_x, row_buffer.AdvanceRow());
805 } else {
806 ConvolveHorizontally<false>(
807 &source_data[next_x_row * source_byte_row_stride],
808 filter_x, row_buffer.AdvanceRow());
809 }
810 } else {
811 ConvolveHorizontally_SSE2(
812 &source_data[next_x_row * source_byte_row_stride],
813 filter_x, row_buffer.AdvanceRow());
814 }
815 next_x_row++;
816 }
334 } 817 }
335 next_x_row++; 818 } else {
819 while (next_x_row < filter_offset + filter_length) {
820 if (source_has_alpha) {
821 ConvolveHorizontally<true>(
822 &source_data[next_x_row * source_byte_row_stride],
823 filter_x, row_buffer.AdvanceRow());
824 } else {
825 ConvolveHorizontally<false>(
826 &source_data[next_x_row * source_byte_row_stride],
827 filter_x, row_buffer.AdvanceRow());
828 }
829 next_x_row++;
830 }
336 } 831 }
337 832
338 // Compute where in the output image this row of final data will go. 833 // Compute where in the output image this row of final data will go.
339 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride]; 834 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
340 835
341 // Get the list of rows that the circular buffer has, in order. 836 // Get the list of rows that the circular buffer has, in order.
342 int first_row_in_circular_buffer; 837 int first_row_in_circular_buffer;
343 unsigned char* const* rows_to_convolve = 838 unsigned char* const* rows_to_convolve =
344 row_buffer.GetRowAddresses(&first_row_in_circular_buffer); 839 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
345 840
346 // Now compute the start of the subset of those rows that the filter 841 // Now compute the start of the subset of those rows that the filter
347 // needs. 842 // needs.
348 unsigned char* const* first_row_for_filter = 843 unsigned char* const* first_row_for_filter =
349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; 844 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
350 845
351 if (source_has_alpha) { 846 if (source_has_alpha) {
352 ConvolveVertically<true>(filter_values, filter_length, 847 if (use_sse2) {
353 first_row_for_filter, 848 ConvolveVertically_SSE2<true>(filter_values, filter_length,
354 filter_x.num_values(), cur_output_row); 849 first_row_for_filter,
850 filter_x.num_values(), cur_output_row);
851 } else {
852 ConvolveVertically<true>(filter_values, filter_length,
853 first_row_for_filter,
854 filter_x.num_values(), cur_output_row);
855 }
355 } else { 856 } else {
356 ConvolveVertically<false>(filter_values, filter_length, 857 if (use_sse2) {
357 first_row_for_filter, 858 ConvolveVertically_SSE2<false>(filter_values, filter_length,
358 filter_x.num_values(), cur_output_row); 859 first_row_for_filter,
860 filter_x.num_values(), cur_output_row);
861 } else {
862 ConvolveVertically<false>(filter_values, filter_length,
863 first_row_for_filter,
864 filter_x.num_values(), cur_output_row);
865 }
359 } 866 }
360 } 867 }
361 } 868 }
362 869
363 } // namespace skia 870 } // namespace skia
OLDNEW
« no previous file with comments | « skia/ext/convolver.h ('k') | skia/ext/convolver_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698