Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(42)

Side by Side Diff: src/opts/SkBitmapFilter_opts_SSE2.cpp

Issue 19335002: Production quality fast image up/downsampler (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: clean up use of filter quality flags Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright 2013 Google Inc. 2 * Copyright 2013 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBitmapProcState.h" 8 #include "SkBitmapProcState.h"
9 #include "SkBitmap.h" 9 #include "SkBitmap.h"
10 #include "SkColor.h" 10 #include "SkColor.h"
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 #include "SkUnPreMultiply.h" 12 #include "SkUnPreMultiply.h"
13 #include "SkShader.h" 13 #include "SkShader.h"
14 #include "SkConvolver.h"
14 15
15 #include "SkBitmapFilter_opts_SSE2.h" 16 #include "SkBitmapFilter_opts_SSE2.h"
16 17
17 #include <emmintrin.h> 18 #include <emmintrin.h>
18 19
19 #if 0 20 #if 0
20 static inline void print128i(__m128i value) { 21 static inline void print128i(__m128i value) {
21 int *v = (int*) &value; 22 int *v = (int*) &value;
22 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); 23 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
23 } 24 }
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after
173 174
174 *colors++ = SkPackARGB32(a, r, g, b); 175 *colors++ = SkPackARGB32(a, r, g, b);
175 176
176 x++; 177 x++;
177 178
178 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), 179 s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
179 SkIntToScalar(y), &srcPt); 180 SkIntToScalar(y), &srcPt);
180 181
181 } 182 }
182 } 183 }
184
185 static void divideByWeights_SSE2(SkScalar *sums, SkScalar *weights, SkBitmap *ds t) {
186 for (int y = 0 ; y < dst->height() ; y++) {
187 for (int x = 0 ; x < dst->width() ; x++) {
188 SkScalar *sump = sums + 4*(y*dst->width() + x);
189 SkScalar weight = weights[y*dst->width() + x];
190
191 SkScalar fr = SkScalarDiv(sump[0], weight);
192 SkScalar fg = SkScalarDiv(sump[1], weight);
193 SkScalar fb = SkScalarDiv(sump[2], weight);
194 SkScalar fa = SkScalarDiv(sump[3], weight);
195 int a = SkClampMax(SkScalarRoundToInt(fa), 255);
196 int r = SkClampMax(SkScalarRoundToInt(fr), a);
197 int g = SkClampMax(SkScalarRoundToInt(fg), a);
198 int b = SkClampMax(SkScalarRoundToInt(fb), a);
199
200 *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
201 }
202 }
203 }
204
205 static void upScaleHorizTranspose_SSE2(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
206 for (int y = 0 ; y < dst->height() ; y++) {
207 for (int x = 0 ; x < dst->width() ; x++) {
208 float sx = (y + 0.5f) / scale - 0.5f;
209 int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->widt h()-1);
210 int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->wid th()-1);
211
212 SkScalar totalWeight = 0;
213 SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
214
215 for (int srcX = x0 ; srcX <= x1 ; srcX++) {
216 SkScalar weight = filter->lookupScalar(sx - srcX);
217 SkPMColor c = *src->getAddr32(srcX, x);
218 fr += SkScalarMul(weight,SkGetPackedR32(c));
219 fg += SkScalarMul(weight,SkGetPackedG32(c));
220 fb += SkScalarMul(weight,SkGetPackedB32(c));
221 fa += SkScalarMul(weight,SkGetPackedA32(c));
222 totalWeight += weight;
223 }
224 fr = SkScalarDiv(fr,totalWeight);
225 fg = SkScalarDiv(fg,totalWeight);
226 fb = SkScalarDiv(fb,totalWeight);
227 fa = SkScalarDiv(fa,totalWeight);
228
229 int a = SkClampMax(SkScalarRoundToInt(fa), 255);
230 int r = SkClampMax(SkScalarRoundToInt(fr), a);
231 int g = SkClampMax(SkScalarRoundToInt(fg), a);
232 int b = SkClampMax(SkScalarRoundToInt(fb), a);
233
234 *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
235 }
236 }
237 }
238
239 static void downScaleHorizTranspose_SSE2(const SkBitmap *src, SkBitmap *dst, flo at scale, SkBitmapFilter *filter) {
240 SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * src->height() * 4);
241 SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * src->height());
242
243 SkAutoTDeleteArray<SkScalar> ada1(sums);
244 SkAutoTDeleteArray<SkScalar> ada2(weights);
245
246 memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
247 memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
248
249 for (int y = 0 ; y < src->height() ; y++) {
250 for (int x = 0 ; x < src->width() ; x++) {
251 // splat each source pixel into the destination image
252 float dx = (x + 0.5f) * scale - 0.5f;
253 int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->heig ht()-1);
254 int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->hei ght()-1);
255
256 SkPMColor c = *src->getAddr32(x,y);
257
258 for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) {
259 SkScalar weight = filter->lookup(dx - dst_x);
260 SkScalar *sump = sums + 4*(dst_x*dst->width() + y);
261
262 sump[0] += weight*SkGetPackedR32(c);
263 sump[1] += weight*SkGetPackedG32(c);
264 sump[2] += weight*SkGetPackedB32(c);
265 sump[3] += weight*SkGetPackedA32(c);
266 weights[dst_x*dst->width() + y] += weight;
267 }
268 }
269 }
270
271 divideByWeights_SSE2(sums, weights, dst);
272 }
273
274 void highQualityScale_SSE2( const SkBitmap *src, SkBitmap *dst ) {
275 SkBitmap horizTemp;
276
277 horizTemp.setConfig(SkBitmap::kARGB_8888_Config, src->height(), dst->width() );
278 horizTemp.allocPixels();
279
280 SkBitmapFilter *filter = SkBitmapFilter::allocate();
281
282 float horizScale = float(dst->width()) / src->width();
283
284 if (horizScale >= 1) {
285 upScaleHorizTranspose_SSE2(src, &horizTemp, horizScale, filter);
286 } else if (horizScale < 1) {
287 downScaleHorizTranspose_SSE2(src, &horizTemp, horizScale, filter);
288 }
289
290 float vertScale = float(dst->height()) / src->height();
291
292 if (vertScale >= 1) {
293 upScaleHorizTranspose_SSE2(&horizTemp, dst, vertScale, filter);
294 } else if (vertScale < 1) {
295 downScaleHorizTranspose_SSE2(&horizTemp, dst, vertScale, filter);
296 }
297
298 SkDELETE(filter);
299 }
300
301 // Convolves horizontally along a single row. The row data is given in
302 // |src_data| and continues for the num_values() of the filter.
303 void convolveHorizontally_SSE2(const unsigned char* src_data,
304 const SkConvolutionFilter1D& filter,
305 unsigned char* out_row,
306 bool /*has_alpha*/) {
307 int num_values = filter.numValues();
308
309 int filter_offset, filter_length;
310 __m128i zero = _mm_setzero_si128();
311 __m128i mask[4];
312 // |mask| will be used to decimate all extra filter coefficients that are
313 // loaded by SIMD when |filter_length| is not divisible by 4.
314 // mask[0] is not used in following algorithm.
315 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
316 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
317 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
318
319 // Output one pixel each iteration, calculating all channels (RGBA) together.
320 for (int out_x = 0; out_x < num_values; out_x++) {
321 const SkConvolutionFilter1D::Fixed* filter_values =
322 filter.FilterForValue(out_x, &filter_offset, &filter_length);
323
324 __m128i accum = _mm_setzero_si128();
325
326 // Compute the first pixel in this row that the filter affects. It will
327 // touch |filter_length| pixels (4 bytes each) after this.
328 const __m128i* row_to_filter =
329 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
330
331 // We will load and accumulate with four coefficients per iteration.
332 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
333
334 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
335 __m128i coeff, coeff16;
336 // [16] xx xx xx xx c3 c2 c1 c0
337 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
338 // [16] xx xx xx xx c1 c1 c0 c0
339 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
340 // [16] c1 c1 c1 c1 c0 c0 c0 c0
341 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
342
343 // Load four pixels => unpack the first two pixels to 16 bits =>
344 // multiply with coefficients => accumulate the convolution result.
345 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
346 __m128i src8 = _mm_loadu_si128(row_to_filter);
347 // [16] a1 b1 g1 r1 a0 b0 g0 r0
348 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
349 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
350 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
351 // [32] a0*c0 b0*c0 g0*c0 r0*c0
352 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
353 accum = _mm_add_epi32(accum, t);
354 // [32] a1*c1 b1*c1 g1*c1 r1*c1
355 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
356 accum = _mm_add_epi32(accum, t);
357
358 // Duplicate 3rd and 4th coefficients for all channels =>
359 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
360 // => accumulate the convolution results.
361 // [16] xx xx xx xx c3 c3 c2 c2
362 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
363 // [16] c3 c3 c3 c3 c2 c2 c2 c2
364 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
365 // [16] a3 g3 b3 r3 a2 g2 b2 r2
366 src16 = _mm_unpackhi_epi8(src8, zero);
367 mul_hi = _mm_mulhi_epi16(src16, coeff16);
368 mul_lo = _mm_mullo_epi16(src16, coeff16);
369 // [32] a2*c2 b2*c2 g2*c2 r2*c2
370 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
371 accum = _mm_add_epi32(accum, t);
372 // [32] a3*c3 b3*c3 g3*c3 r3*c3
373 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
374 accum = _mm_add_epi32(accum, t);
375
376 // Advance the pixel and coefficients pointers.
377 row_to_filter += 1;
378 filter_values += 4;
379 }
380
381 // When |filter_length| is not divisible by 4, we need to decimate some of
382 // the filter coefficient that was loaded incorrectly to zero; Other than
383 // that the algorithm is same with above, exceot that the 4th pixel will be
384 // always absent.
385 int r = filter_length&3;
386 if (r) {
387 // Note: filter_values must be padded to align_up(filter_offset, 8).
388 __m128i coeff, coeff16;
389 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
390 // Mask out extra filter taps.
391 coeff = _mm_and_si128(coeff, mask[r]);
392 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
393 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
394
395 // Note: line buffer must be padded to align_up(filter_offset, 16).
396 // We resolve this by use C-version for the last horizontal line.
397 __m128i src8 = _mm_loadu_si128(row_to_filter);
398 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
399 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
400 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
401 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
402 accum = _mm_add_epi32(accum, t);
403 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
404 accum = _mm_add_epi32(accum, t);
405
406 src16 = _mm_unpackhi_epi8(src8, zero);
407 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
408 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
409 mul_hi = _mm_mulhi_epi16(src16, coeff16);
410 mul_lo = _mm_mullo_epi16(src16, coeff16);
411 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
412 accum = _mm_add_epi32(accum, t);
413 }
414
415 // Shift right for fixed point implementation.
416 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
417
418 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
419 accum = _mm_packs_epi32(accum, zero);
420 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
421 accum = _mm_packus_epi16(accum, zero);
422
423 // Store the pixel value of 32 bits.
424 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
425 out_row += 4;
426 }
427 }
428
429 // Convolves horizontally along four rows. The row data is given in
430 // |src_data| and continues for the num_values() of the filter.
431 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
432 // refer to that function for detailed comments.
433 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
434 const SkConvolutionFilter1D& filter,
435 unsigned char* out_row[4]) {
436 int num_values = filter.numValues();
437
438 int filter_offset, filter_length;
439 __m128i zero = _mm_setzero_si128();
440 __m128i mask[4];
441 // |mask| will be used to decimate all extra filter coefficients that are
442 // loaded by SIMD when |filter_length| is not divisible by 4.
443 // mask[0] is not used in following algorithm.
444 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
445 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
446 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
447
448 // Output one pixel each iteration, calculating all channels (RGBA) together.
449 for (int out_x = 0; out_x < num_values; out_x++) {
450 const SkConvolutionFilter1D::Fixed* filter_values =
451 filter.FilterForValue(out_x, &filter_offset, &filter_length);
452
453 // four pixels in a column per iteration.
454 __m128i accum0 = _mm_setzero_si128();
455 __m128i accum1 = _mm_setzero_si128();
456 __m128i accum2 = _mm_setzero_si128();
457 __m128i accum3 = _mm_setzero_si128();
458 int start = (filter_offset<<2);
459 // We will load and accumulate with four coefficients per iteration.
460 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
461 __m128i coeff, coeff16lo, coeff16hi;
462 // [16] xx xx xx xx c3 c2 c1 c0
463 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
464 // [16] xx xx xx xx c1 c1 c0 c0
465 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
466 // [16] c1 c1 c1 c1 c0 c0 c0 c0
467 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
468 // [16] xx xx xx xx c3 c3 c2 c2
469 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
470 // [16] c3 c3 c3 c3 c2 c2 c2 c2
471 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
472
473 __m128i src8, src16, mul_hi, mul_lo, t;
474
475 #define ITERATION(src, accum) \
476 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
477 src16 = _mm_unpacklo_epi8(src8, zero); \
478 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
479 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
480 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
481 accum = _mm_add_epi32(accum, t); \
482 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
483 accum = _mm_add_epi32(accum, t); \
484 src16 = _mm_unpackhi_epi8(src8, zero); \
485 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
486 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
487 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
488 accum = _mm_add_epi32(accum, t); \
489 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
490 accum = _mm_add_epi32(accum, t)
491
492 ITERATION(src_data[0] + start, accum0);
493 ITERATION(src_data[1] + start, accum1);
494 ITERATION(src_data[2] + start, accum2);
495 ITERATION(src_data[3] + start, accum3);
496
497 start += 16;
498 filter_values += 4;
499 }
500
501 int r = filter_length & 3;
502 if (r) {
503 // Note: filter_values must be padded to align_up(filter_offset, 8);
504 __m128i coeff;
505 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
506 // Mask out extra filter taps.
507 coeff = _mm_and_si128(coeff, mask[r]);
508
509 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
510 /* c1 c1 c1 c1 c0 c0 c0 c0 */
511 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
512 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
513 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
514
515 __m128i src8, src16, mul_hi, mul_lo, t;
516
517 ITERATION(src_data[0] + start, accum0);
518 ITERATION(src_data[1] + start, accum1);
519 ITERATION(src_data[2] + start, accum2);
520 ITERATION(src_data[3] + start, accum3);
521 }
522
523 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
524 accum0 = _mm_packs_epi32(accum0, zero);
525 accum0 = _mm_packus_epi16(accum0, zero);
526 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
527 accum1 = _mm_packs_epi32(accum1, zero);
528 accum1 = _mm_packus_epi16(accum1, zero);
529 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
530 accum2 = _mm_packs_epi32(accum2, zero);
531 accum2 = _mm_packus_epi16(accum2, zero);
532 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
533 accum3 = _mm_packs_epi32(accum3, zero);
534 accum3 = _mm_packus_epi16(accum3, zero);
535
536 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
537 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
538 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
539 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
540
541 out_row[0] += 4;
542 out_row[1] += 4;
543 out_row[2] += 4;
544 out_row[3] += 4;
545 }
546 }
547
548 // Does vertical convolution to produce one output row. The filter values and
549 // length are given in the first two parameters. These are applied to each
550 // of the rows pointed to in the |source_data_rows| array, with each row
551 // being |pixel_width| wide.
552 //
553 // The output must have room for |pixel_width * 4| bytes.
554 template<bool has_alpha>
555 void convolveVertically_SSE2(const SkConvolutionFilter1D::Fixed* filter_values,
556 int filter_length,
557 unsigned char* const* source_data_rows,
558 int pixel_width,
559 unsigned char* out_row) {
560 int width = pixel_width & ~3;
561
562 __m128i zero = _mm_setzero_si128();
563 __m128i accum0, accum1, accum2, accum3, coeff16;
564 const __m128i* src;
565 // Output four pixels per iteration (16 bytes).
566 for (int out_x = 0; out_x < width; out_x += 4) {
567
568 // Accumulated result for each pixel. 32 bits per RGBA channel.
569 accum0 = _mm_setzero_si128();
570 accum1 = _mm_setzero_si128();
571 accum2 = _mm_setzero_si128();
572 accum3 = _mm_setzero_si128();
573
574 // Convolve with one filter coefficient per iteration.
575 for (int filter_y = 0; filter_y < filter_length; filter_y++) {
576
577 // Duplicate the filter coefficient 8 times.
578 // [16] cj cj cj cj cj cj cj cj
579 coeff16 = _mm_set1_epi16(filter_values[filter_y]);
580
581 // Load four pixels (16 bytes) together.
582 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
583 src = reinterpret_cast<const __m128i*>(
584 &source_data_rows[filter_y][out_x << 2]);
585 __m128i src8 = _mm_loadu_si128(src);
586
587 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
588 // multiply with current coefficient => accumulate the result.
589 // [16] a1 b1 g1 r1 a0 b0 g0 r0
590 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
591 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
592 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
593 // [32] a0 b0 g0 r0
594 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
595 accum0 = _mm_add_epi32(accum0, t);
596 // [32] a1 b1 g1 r1
597 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
598 accum1 = _mm_add_epi32(accum1, t);
599
600 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
601 // multiply with current coefficient => accumulate the result.
602 // [16] a3 b3 g3 r3 a2 b2 g2 r2
603 src16 = _mm_unpackhi_epi8(src8, zero);
604 mul_hi = _mm_mulhi_epi16(src16, coeff16);
605 mul_lo = _mm_mullo_epi16(src16, coeff16);
606 // [32] a2 b2 g2 r2
607 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
608 accum2 = _mm_add_epi32(accum2, t);
609 // [32] a3 b3 g3 r3
610 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
611 accum3 = _mm_add_epi32(accum3, t);
612 }
613
614 // Shift right for fixed point implementation.
615 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
616 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
617 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
618 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
619
620 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
621 // [16] a1 b1 g1 r1 a0 b0 g0 r0
622 accum0 = _mm_packs_epi32(accum0, accum1);
623 // [16] a3 b3 g3 r3 a2 b2 g2 r2
624 accum2 = _mm_packs_epi32(accum2, accum3);
625
626 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
627 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
628 accum0 = _mm_packus_epi16(accum0, accum2);
629
630 if (has_alpha) {
631 // Compute the max(ri, gi, bi) for each pixel.
632 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
633 __m128i a = _mm_srli_epi32(accum0, 8);
634 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
635 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
636 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
637 a = _mm_srli_epi32(accum0, 16);
638 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
639 b = _mm_max_epu8(a, b); // Max of r and g and b.
640 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
641 b = _mm_slli_epi32(b, 24);
642
643 // Make sure the value of alpha channel is always larger than maximum
644 // value of color channels.
645 accum0 = _mm_max_epu8(b, accum0);
646 } else {
647 // Set value of alpha channels to 0xFF.
648 __m128i mask = _mm_set1_epi32(0xff000000);
649 accum0 = _mm_or_si128(accum0, mask);
650 }
651
652 // Store the convolution result (16 bytes) and advance the pixel pointers.
653 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
654 out_row += 16;
655 }
656
657 // When the width of the output is not divisible by 4, We need to save one
658 // pixel (4 bytes) each time. And also the fourth pixel is always absent.
659 if (pixel_width & 3) {
660 accum0 = _mm_setzero_si128();
661 accum1 = _mm_setzero_si128();
662 accum2 = _mm_setzero_si128();
663 for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
664 coeff16 = _mm_set1_epi16(filter_values[filter_y]);
665 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
666 src = reinterpret_cast<const __m128i*>(
667 &source_data_rows[filter_y][width<<2]);
668 __m128i src8 = _mm_loadu_si128(src);
669 // [16] a1 b1 g1 r1 a0 b0 g0 r0
670 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
671 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
672 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
673 // [32] a0 b0 g0 r0
674 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
675 accum0 = _mm_add_epi32(accum0, t);
676 // [32] a1 b1 g1 r1
677 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
678 accum1 = _mm_add_epi32(accum1, t);
679 // [16] a3 b3 g3 r3 a2 b2 g2 r2
680 src16 = _mm_unpackhi_epi8(src8, zero);
681 mul_hi = _mm_mulhi_epi16(src16, coeff16);
682 mul_lo = _mm_mullo_epi16(src16, coeff16);
683 // [32] a2 b2 g2 r2
684 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
685 accum2 = _mm_add_epi32(accum2, t);
686 }
687
688 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
689 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
690 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
691 // [16] a1 b1 g1 r1 a0 b0 g0 r0
692 accum0 = _mm_packs_epi32(accum0, accum1);
693 // [16] a3 b3 g3 r3 a2 b2 g2 r2
694 accum2 = _mm_packs_epi32(accum2, zero);
695 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
696 accum0 = _mm_packus_epi16(accum0, accum2);
697 if (has_alpha) {
698 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
699 __m128i a = _mm_srli_epi32(accum0, 8);
700 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
701 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
702 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
703 a = _mm_srli_epi32(accum0, 16);
704 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
705 b = _mm_max_epu8(a, b); // Max of r and g and b.
706 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
707 b = _mm_slli_epi32(b, 24);
708 accum0 = _mm_max_epu8(b, accum0);
709 } else {
710 __m128i mask = _mm_set1_epi32(0xff000000);
711 accum0 = _mm_or_si128(accum0, mask);
712 }
713
714 for (int out_x = width; out_x < pixel_width; out_x++) {
715 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
716 accum0 = _mm_srli_si128(accum0, 4);
717 out_row += 4;
718 }
719 }
720 }
721
722 void convolveVertically_SSE2(const SkConvolutionFilter1D::Fixed* filter_values,
723 int filter_length,
724 unsigned char* const* source_data_rows,
725 int pixel_width,
726 unsigned char* out_row,
727 bool has_alpha) {
728 if (has_alpha) {
729 convolveVertically_SSE2<true>(filter_values,
730 filter_length,
731 source_data_rows,
732 pixel_width,
733 out_row);
734 } else {
735 convolveVertically_SSE2<false>(filter_values,
736 filter_length,
737 source_data_rows,
738 pixel_width,
739 out_row);
740 }
741 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698