src/opts/SkBitmapFilter_opts_SSE2.cpp - Issue 19335002: Production quality fast image up/downsampler

Side by Side Diff: src/opts/SkBitmapFilter_opts_SSE2.cpp

Issue 19335002: Production quality fast image up/downsampler (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Changes from mike to remove dependencies on std C++ library Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 Google Inc.	2 * Copyright 2013 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBitmapProcState.h"	8 #include "SkBitmapProcState.h"

9 #include "SkBitmap.h"	9 #include "SkBitmap.h"

10 #include "SkColor.h"	10 #include "SkColor.h"

11 #include "SkColorPriv.h"	11 #include "SkColorPriv.h"

12 #include "SkUnPreMultiply.h"	12 #include "SkUnPreMultiply.h"

13 #include "SkShader.h"	13 #include "SkShader.h"

	14 #include "SkConvolver.h"

14	15

15 #include "SkBitmapFilter_opts_SSE2.h"	16 #include "SkBitmapFilter_opts_SSE2.h"

16	17

17 #include <emmintrin.h>	18 #include <emmintrin.h>

18	19

19 #if 0	20 #if 0

20 static inline void print128i(__m128i value) {	21 static inline void print128i(__m128i value) {

21 int *v = (int*) &value;	22 int *v = (int*) &value;

22 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);	23 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);

23 }	24 }

(...skipping 149 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
173	174

174 *colors++ = SkPackARGB32(a, r, g, b);	175 *colors++ = SkPackARGB32(a, r, g, b);

175	176

176 x++;	177 x++;

177	178

178 s.fInvProc(s.fInvMatrix, SkIntToScalar(x),	179 s.fInvProc(s.fInvMatrix, SkIntToScalar(x),

179 SkIntToScalar(y), &srcPt);	180 SkIntToScalar(y), &srcPt);

180	181

181 }	182 }

182 }	183 }

	184

	185 static void divideByWeights_SSE2(SkScalar *sums, SkScalar *weights, SkBitmap *dst) {
	Stephen White 2013/07/18 17:39:45: Doesn't look like SSE2 code; maybe it should just be divideByWeights(), since it's file-static? Or is this going to be SSE2 eventually? humper 2013/07/18 18:15:46 (in reply): Yikes, these functions are dead anyway; deleting.
	186 for (int y = 0 ; y < dst->height() ; y++) {

	187 for (int x = 0 ; x < dst->width() ; x++) {

	188 SkScalar *sump = sums + 4*(y*dst->width() + x);

	189 SkScalar weight = weights[y*dst->width() + x];

	190

	191 SkScalar fr = SkScalarDiv(sump[0], weight);

	192 SkScalar fg = SkScalarDiv(sump[1], weight);

	193 SkScalar fb = SkScalarDiv(sump[2], weight);

	194 SkScalar fa = SkScalarDiv(sump[3], weight);

	195 int a = SkClampMax(SkScalarRoundToInt(fa), 255);

	196 int r = SkClampMax(SkScalarRoundToInt(fr), a);

	197 int g = SkClampMax(SkScalarRoundToInt(fg), a);

	198 int b = SkClampMax(SkScalarRoundToInt(fb), a);

	199

	200 *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);

	201 }

	202 }

	203 }

	204

	205 static void upScaleHorizTranspose_SSE2(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
	Stephen White 2013/07/18 17:39:45: Same here.
	206 for (int y = 0 ; y < dst->height() ; y++) {

	207 for (int x = 0 ; x < dst->width() ; x++) {

	208 float sx = (y + 0.5f) / scale - 0.5f;

	209 int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->width()-1);

	210 int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->width()-1);

	211

	212 SkScalar totalWeight = 0;

	213 SkScalar fr = 0, fg = 0, fb = 0, fa = 0;

	214

	215 for (int srcX = x0 ; srcX <= x1 ; srcX++) {

	216 SkScalar weight = filter->lookupScalar(sx - srcX);

	217 SkPMColor c = *src->getAddr32(srcX, x);

	218 fr += SkScalarMul(weight,SkGetPackedR32(c));

	219 fg += SkScalarMul(weight,SkGetPackedG32(c));

	220 fb += SkScalarMul(weight,SkGetPackedB32(c));

	221 fa += SkScalarMul(weight,SkGetPackedA32(c));

	222 totalWeight += weight;

	223 }

	224 fr = SkScalarDiv(fr,totalWeight);

	225 fg = SkScalarDiv(fg,totalWeight);

	226 fb = SkScalarDiv(fb,totalWeight);

	227 fa = SkScalarDiv(fa,totalWeight);

	228

	229 int a = SkClampMax(SkScalarRoundToInt(fa), 255);

	230 int r = SkClampMax(SkScalarRoundToInt(fr), a);

	231 int g = SkClampMax(SkScalarRoundToInt(fg), a);

	232 int b = SkClampMax(SkScalarRoundToInt(fb), a);

	233

	234 *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);

	235 }

	236 }

	237 }

	238

	239 static void downScaleHorizTranspose_SSE2(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {

	240 SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * src->height() * 4);

	241 SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * src->height());

	242

	243 SkAutoTDeleteArray<SkScalar> ada1(sums);

	244 SkAutoTDeleteArray<SkScalar> ada2(weights);

	245

	246 memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);

	247 memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));

	248

	249 for (int y = 0 ; y < src->height() ; y++) {

	250 for (int x = 0 ; x < src->width() ; x++) {

	251 // splat each source pixel into the destination image

	252 float dx = (x + 0.5f) * scale - 0.5f;

	253 int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->height()-1);

	254 int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->height()-1);

	255

	256 SkPMColor c = *src->getAddr32(x,y);

	257

	258 for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) {

	259 SkScalar weight = filter->lookup(dx - dst_x);

	260 SkScalar *sump = sums + 4*(dst_x*dst->width() + y);

	261

	262 sump[0] += weight*SkGetPackedR32(c);

	263 sump[1] += weight*SkGetPackedG32(c);

	264 sump[2] += weight*SkGetPackedB32(c);

	265 sump[3] += weight*SkGetPackedA32(c);

	266 weights[dst_x*dst->width() + y] += weight;

	267 }

	268 }

	269 }

	270

	271 divideByWeights_SSE2(sums, weights, dst);

	272 }

	273

	274 void highQualityScale_SSE2( const SkBitmap *src, SkBitmap *dst ) {

	275 SkBitmap horizTemp;

	276

	277 horizTemp.setConfig(SkBitmap::kARGB_8888_Config, src->height(), dst->width() );

	278 horizTemp.allocPixels();

	279

	280 SkBitmapFilter *filter = SkBitmapFilter::allocate();

	281

	282 float horizScale = float(dst->width()) / src->width();

	283

	284 if (horizScale >= 1) {

	285 upScaleHorizTranspose_SSE2(src, &horizTemp, horizScale, filter);

	286 } else if (horizScale < 1) {

	287 downScaleHorizTranspose_SSE2(src, &horizTemp, horizScale, filter);

	288 }

	289

	290 float vertScale = float(dst->height()) / src->height();

	291

	292 if (vertScale >= 1) {

	293 upScaleHorizTranspose_SSE2(&horizTemp, dst, vertScale, filter);

	294 } else if (vertScale < 1) {

	295 downScaleHorizTranspose_SSE2(&horizTemp, dst, vertScale, filter);

	296 }

	297

	298 SkDELETE(filter);

	299 }

	300

	301 // Convolves horizontally along a single row. The row data is given in

	302 // \|src_data\| and continues for the num_values() of the filter.

	303 void convolveHorizontally_SSE2(const unsigned char* src_data,

	304 const SkConvolutionFilter1D& filter,

	305 unsigned char* out_row,

	306 bool /*has_alpha*/) {

	307 int num_values = filter.numValues();

	308

	309 int filter_offset, filter_length;

	310 __m128i zero = _mm_setzero_si128();

	311 __m128i mask[4];

	312 // \|mask\| will be used to decimate all extra filter coefficients that are

	313 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	314 // mask[0] is not used in following algorithm.

	315 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

	316 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

	317 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

	318

	319 // Output one pixel each iteration, calculating all channels (RGBA) together.

	320 for (int out_x = 0; out_x < num_values; out_x++) {

	321 const SkConvolutionFilter1D::Fixed* filter_values =

	322 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	323

	324 __m128i accum = _mm_setzero_si128();

	325

	326 // Compute the first pixel in this row that the filter affects. It will

	327 // touch \|filter_length\| pixels (4 bytes each) after this.

	328 const __m128i* row_to_filter =

	329 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

	330

	331 // We will load and accumulate with four coefficients per iteration.

	332 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

	333

	334 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.

	335 __m128i coeff, coeff16;

	336 // [16] xx xx xx xx c3 c2 c1 c0

	337 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	338 // [16] xx xx xx xx c1 c1 c0 c0

	339 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	340 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	341 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	342

	343 // Load four pixels => unpack the first two pixels to 16 bits =>

	344 // multiply with coefficients => accumulate the convolution result.

	345 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	346 __m128i src8 = _mm_loadu_si128(row_to_filter);

	347 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	348 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	349 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	350 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	351 // [32] a0c0 b0c0 g0c0 r0c0

	352 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	353 accum = _mm_add_epi32(accum, t);

	354 // [32] a1c1 b1c1 g1c1 r1c1

	355 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	356 accum = _mm_add_epi32(accum, t);

	357

	358 // Duplicate 3rd and 4th coefficients for all channels =>

	359 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients

	360 // => accumulate the convolution results.

	361 // [16] xx xx xx xx c3 c3 c2 c2

	362 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	363 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	364 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	365 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	366 src16 = _mm_unpackhi_epi8(src8, zero);

	367 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	368 mul_lo = _mm_mullo_epi16(src16, coeff16);

	369 // [32] a2c2 b2c2 g2c2 r2c2

	370 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	371 accum = _mm_add_epi32(accum, t);

	372 // [32] a3c3 b3c3 g3c3 r3c3

	373 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	374 accum = _mm_add_epi32(accum, t);

	375

	376 // Advance the pixel and coefficients pointers.

	377 row_to_filter += 1;

	378 filter_values += 4;

	379 }

	380

	381 // When \|filter_length\| is not divisible by 4, we need to decimate some of

	382 // the filter coefficient that was loaded incorrectly to zero; Other than

	383 // that the algorithm is the same as above, except that the 4th pixel will be

	384 // always absent.

	385 int r = filter_length&3;

	386 if (r) {

	387 // Note: filter_values must be padded to align_up(filter_offset, 8).

	388 __m128i coeff, coeff16;

	389 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	390 // Mask out extra filter taps.

	391 coeff = _mm_and_si128(coeff, mask[r]);

	392 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	393 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	394

	395 // Note: line buffer must be padded to align_up(filter_offset, 16).

	396 // We resolve this by using the C version for the last horizontal line.

	397 __m128i src8 = _mm_loadu_si128(row_to_filter);

	398 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	399 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	400 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	401 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	402 accum = _mm_add_epi32(accum, t);

	403 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	404 accum = _mm_add_epi32(accum, t);

	405

	406 src16 = _mm_unpackhi_epi8(src8, zero);

	407 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	408 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	409 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	410 mul_lo = _mm_mullo_epi16(src16, coeff16);

	411 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	412 accum = _mm_add_epi32(accum, t);

	413 }

	414

	415 // Shift right for fixed point implementation.

	416 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);

	417

	418 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	419 accum = _mm_packs_epi32(accum, zero);

	420 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	421 accum = _mm_packus_epi16(accum, zero);

	422

	423 // Store the pixel value of 32 bits.

	424 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);

	425 out_row += 4;

	426 }

	427 }

	428

	429 // Convolves horizontally along four rows. The row data is given in

	430 // \|src_data\| and continues for the num_values() of the filter.

	431 // The algorithm is almost the same as |convolveHorizontally_SSE2|. Please

	432 // refer to that function for detailed comments.

	433 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],

	434 const SkConvolutionFilter1D& filter,

	435 unsigned char* out_row[4]) {

	436 int num_values = filter.numValues();

	437

	438 int filter_offset, filter_length;

	439 __m128i zero = _mm_setzero_si128();

	440 __m128i mask[4];

	441 // \|mask\| will be used to decimate all extra filter coefficients that are

	442 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	443 // mask[0] is not used in following algorithm.

	444 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

	445 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

	446 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

	447

	448 // Output one pixel each iteration, calculating all channels (RGBA) together.

	449 for (int out_x = 0; out_x < num_values; out_x++) {

	450 const SkConvolutionFilter1D::Fixed* filter_values =

	451 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	452

	453 // four pixels in a column per iteration.

	454 __m128i accum0 = _mm_setzero_si128();

	455 __m128i accum1 = _mm_setzero_si128();

	456 __m128i accum2 = _mm_setzero_si128();

	457 __m128i accum3 = _mm_setzero_si128();

	458 int start = (filter_offset<<2);

	459 // We will load and accumulate with four coefficients per iteration.

	460 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {

	461 __m128i coeff, coeff16lo, coeff16hi;

	462 // [16] xx xx xx xx c3 c2 c1 c0

	463 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	464 // [16] xx xx xx xx c1 c1 c0 c0

	465 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	466 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	467 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	468 // [16] xx xx xx xx c3 c3 c2 c2

	469 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	470 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	471 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	472

	473 __m128i src8, src16, mul_hi, mul_lo, t;

	474

	475 #define ITERATION(src, accum) \

	476 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \

	477 src16 = _mm_unpacklo_epi8(src8, zero); \

	478 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \

	479 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \

	480 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	481 accum = _mm_add_epi32(accum, t); \

	482 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	483 accum = _mm_add_epi32(accum, t); \

	484 src16 = _mm_unpackhi_epi8(src8, zero); \

	485 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \

	486 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \

	487 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	488 accum = _mm_add_epi32(accum, t); \

	489 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	490 accum = _mm_add_epi32(accum, t)

	491

	492 ITERATION(src_data[0] + start, accum0);

	493 ITERATION(src_data[1] + start, accum1);

	494 ITERATION(src_data[2] + start, accum2);

	495 ITERATION(src_data[3] + start, accum3);

	496

	497 start += 16;

	498 filter_values += 4;

	499 }

	500

	501 int r = filter_length & 3;

	502 if (r) {

	503 // Note: filter_values must be padded to align_up(filter_offset, 8);

	504 __m128i coeff;

	505 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	506 // Mask out extra filter taps.

	507 coeff = _mm_and_si128(coeff, mask[r]);

	508

	509 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	510 /* c1 c1 c1 c1 c0 c0 c0 c0 */

	511 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	512 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	513 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	514

	515 __m128i src8, src16, mul_hi, mul_lo, t;

	516

	517 ITERATION(src_data[0] + start, accum0);

	518 ITERATION(src_data[1] + start, accum1);

	519 ITERATION(src_data[2] + start, accum2);

	520 ITERATION(src_data[3] + start, accum3);

	521 }

	522

	523 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);

	524 accum0 = _mm_packs_epi32(accum0, zero);

	525 accum0 = _mm_packus_epi16(accum0, zero);

	526 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);

	527 accum1 = _mm_packs_epi32(accum1, zero);

	528 accum1 = _mm_packus_epi16(accum1, zero);

	529 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);

	530 accum2 = _mm_packs_epi32(accum2, zero);

	531 accum2 = _mm_packus_epi16(accum2, zero);

	532 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);

	533 accum3 = _mm_packs_epi32(accum3, zero);

	534 accum3 = _mm_packus_epi16(accum3, zero);

	535

	536 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);

	537 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);

	538 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);

	539 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

	540

	541 out_row[0] += 4;

	542 out_row[1] += 4;

	543 out_row[2] += 4;

	544 out_row[3] += 4;

	545 }

	546 }

	547

	548 // Does vertical convolution to produce one output row. The filter values and

	549 // length are given in the first two parameters. These are applied to each

	550 // of the rows pointed to in the \|source_data_rows\| array, with each row

	551 // being \|pixel_width\| wide.

	552 //

	553 // The output must have room for \|pixel_width * 4\| bytes.

	554 template<bool has_alpha>

	555 void convolveVertically_SSE2(const SkConvolutionFilter1D::Fixed* filter_values,

	556 int filter_length,

	557 unsigned char* const* source_data_rows,

	558 int pixel_width,

	559 unsigned char* out_row) {

	560 int width = pixel_width & ~3;

	561

	562 __m128i zero = _mm_setzero_si128();

	563 __m128i accum0, accum1, accum2, accum3, coeff16;

	564 const __m128i* src;

	565 // Output four pixels per iteration (16 bytes).

	566 for (int out_x = 0; out_x < width; out_x += 4) {

	567

	568 // Accumulated result for each pixel. 32 bits per RGBA channel.

	569 accum0 = _mm_setzero_si128();

	570 accum1 = _mm_setzero_si128();

	571 accum2 = _mm_setzero_si128();

	572 accum3 = _mm_setzero_si128();

	573

	574 // Convolve with one filter coefficient per iteration.

	575 for (int filter_y = 0; filter_y < filter_length; filter_y++) {

	576

	577 // Duplicate the filter coefficient 8 times.

	578 // [16] cj cj cj cj cj cj cj cj

	579 coeff16 = _mm_set1_epi16(filter_values[filter_y]);

	580

	581 // Load four pixels (16 bytes) together.

	582 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	583 src = reinterpret_cast<const __m128i*>(

	584 &source_data_rows[filter_y][out_x << 2]);

	585 __m128i src8 = _mm_loadu_si128(src);

	586

	587 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>

	588 // multiply with current coefficient => accumulate the result.

	589 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	590 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	591 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	592 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	593 // [32] a0 b0 g0 r0

	594 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	595 accum0 = _mm_add_epi32(accum0, t);

	596 // [32] a1 b1 g1 r1

	597 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	598 accum1 = _mm_add_epi32(accum1, t);

	599

	600 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>

	601 // multiply with current coefficient => accumulate the result.

	602 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	603 src16 = _mm_unpackhi_epi8(src8, zero);

	604 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	605 mul_lo = _mm_mullo_epi16(src16, coeff16);

	606 // [32] a2 b2 g2 r2

	607 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	608 accum2 = _mm_add_epi32(accum2, t);

	609 // [32] a3 b3 g3 r3

	610 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	611 accum3 = _mm_add_epi32(accum3, t);

	612 }

	613

	614 // Shift right for fixed point implementation.

	615 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);

	616 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);

	617 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);

	618 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);

	619

	620 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	621 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	622 accum0 = _mm_packs_epi32(accum0, accum1);

	623 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	624 accum2 = _mm_packs_epi32(accum2, accum3);

	625

	626 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	627 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	628 accum0 = _mm_packus_epi16(accum0, accum2);

	629

	630 if (has_alpha) {

	631 // Compute the max(ri, gi, bi) for each pixel.

	632 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	633 __m128i a = _mm_srli_epi32(accum0, 8);

	634 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	635 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	636 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	637 a = _mm_srli_epi32(accum0, 16);

	638 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	639 b = _mm_max_epu8(a, b); // Max of r and g and b.

	640 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	641 b = _mm_slli_epi32(b, 24);

	642

	643 // Make sure the value of alpha channel is always larger than maximum

	644 // value of color channels.

	645 accum0 = _mm_max_epu8(b, accum0);

	646 } else {

	647 // Set value of alpha channels to 0xFF.

	648 __m128i mask = _mm_set1_epi32(0xff000000);

	649 accum0 = _mm_or_si128(accum0, mask);

	650 }

	651

	652 // Store the convolution result (16 bytes) and advance the pixel pointers.

	653 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);

	654 out_row += 16;

	655 }

	656

	657 // When the width of the output is not divisible by 4, We need to save one

	658 // pixel (4 bytes) each time. And also the fourth pixel is always absent.

	659 if (pixel_width & 3) {

	660 accum0 = _mm_setzero_si128();

	661 accum1 = _mm_setzero_si128();

	662 accum2 = _mm_setzero_si128();

	663 for (int filter_y = 0; filter_y < filter_length; ++filter_y) {

	664 coeff16 = _mm_set1_epi16(filter_values[filter_y]);

	665 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	666 src = reinterpret_cast<const __m128i*>(

	667 &source_data_rows[filter_y][width<<2]);

	668 __m128i src8 = _mm_loadu_si128(src);

	669 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	670 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	671 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	672 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	673 // [32] a0 b0 g0 r0

	674 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	675 accum0 = _mm_add_epi32(accum0, t);

	676 // [32] a1 b1 g1 r1

	677 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	678 accum1 = _mm_add_epi32(accum1, t);

	679 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	680 src16 = _mm_unpackhi_epi8(src8, zero);

	681 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	682 mul_lo = _mm_mullo_epi16(src16, coeff16);

	683 // [32] a2 b2 g2 r2

	684 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	685 accum2 = _mm_add_epi32(accum2, t);

	686 }

	687

	688 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);

	689 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);

	690 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);

	691 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	692 accum0 = _mm_packs_epi32(accum0, accum1);

	693 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	694 accum2 = _mm_packs_epi32(accum2, zero);

	695 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	696 accum0 = _mm_packus_epi16(accum0, accum2);

	697 if (has_alpha) {

	698 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	699 __m128i a = _mm_srli_epi32(accum0, 8);

	700 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	701 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	702 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	703 a = _mm_srli_epi32(accum0, 16);

	704 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	705 b = _mm_max_epu8(a, b); // Max of r and g and b.

	706 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	707 b = _mm_slli_epi32(b, 24);

	708 accum0 = _mm_max_epu8(b, accum0);

	709 } else {

	710 __m128i mask = _mm_set1_epi32(0xff000000);

	711 accum0 = _mm_or_si128(accum0, mask);

	712 }

	713

	714 for (int out_x = width; out_x < pixel_width; out_x++) {

	715 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);

	716 accum0 = _mm_srli_si128(accum0, 4);

	717 out_row += 4;

	718 }

	719 }

	720 }

	721

	722 void convolveVertically_SSE2(const SkConvolutionFilter1D::Fixed* filter_values,

	723 int filter_length,

	724 unsigned char* const* source_data_rows,

	725 int pixel_width,

	726 unsigned char* out_row,

	727 bool has_alpha) {

	728 if (has_alpha) {

	729 convolveVertically_SSE2<true>(filter_values,

	730 filter_length,

	731 source_data_rows,

	732 pixel_width,

	733 out_row);

	734 } else {

	735 convolveVertically_SSE2<false>(filter_values,

	736 filter_length,

	737 source_data_rows,

	738 pixel_width,

	739 out_row);

	740 }

	741 }

OLD	NEW

« src/core/SkBitmapFilter.cpp ('K') | « src/opts/SkBitmapFilter_opts_SSE2.h ('k') | src/opts/SkBitmapProcState_opts_none.cpp » ('j') | no next file with comments »