skia/ext/convolver.cc - Issue 6334070: SIMD implementation of Convolver for Lanczos filter etc.

Side by Side Diff: skia/ext/convolver.cc

Issue 6334070: SIMD implementation of Convolver for Lanczos filter etc. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: typo Created 9 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include <algorithm>	5 #include <algorithm>

6	6

7 #include "skia/ext/convolver.h"	7 #include "skia/ext/convolver.h"

8 #include "third_party/skia/include/core/SkTypes.h"	8 #include "third_party/skia/include/core/SkTypes.h"

9	9

	10 #if defined(SIMD_SSE2)

	11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h

	12 #endif

	13

10 namespace skia {	14 namespace skia {

11	15

12 namespace {	16 namespace {

13	17

14 // Converts the argument to an 8-bit unsigned value by clamping to the range	18 // Converts the argument to an 8-bit unsigned value by clamping to the range

15 // 0-255.	19 // 0-255.

16 inline unsigned char ClampTo8(int a) {	20 inline unsigned char ClampTo8(int a) {

17 if (static_cast<unsigned>(a) < 256)	21 if (static_cast<unsigned>(a) < 256)

18 return a; // Avoid the extra check in the common case.	22 return a; // Avoid the extra check in the common case.

19 if (a < 0)	23 if (a < 0)

(...skipping 172 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
192 if (has_alpha)	196 if (has_alpha)

193 accum[3] >>= ConvolutionFilter1D::kShiftBits;	197 accum[3] >>= ConvolutionFilter1D::kShiftBits;

194	198

195 // Store the new pixel.	199 // Store the new pixel.

196 out_row[byte_offset + 0] = ClampTo8(accum[0]);	200 out_row[byte_offset + 0] = ClampTo8(accum[0]);

197 out_row[byte_offset + 1] = ClampTo8(accum[1]);	201 out_row[byte_offset + 1] = ClampTo8(accum[1]);

198 out_row[byte_offset + 2] = ClampTo8(accum[2]);	202 out_row[byte_offset + 2] = ClampTo8(accum[2]);

199 if (has_alpha) {	203 if (has_alpha) {

200 unsigned char alpha = ClampTo8(accum[3]);	204 unsigned char alpha = ClampTo8(accum[3]);

201	205

202 // Make sure the alpha channel doesn't come out larger than any of the	206 // Make sure the alpha channel doesn't come out smaller than any of the

203 // color channels. We use premultipled alpha channels, so this should	207 // color channels. We use premultipled alpha channels, so this should

204 // never happen, but rounding errors will cause this from time to time.	208 // never happen, but rounding errors will cause this from time to time.

205 // These "impossible" colors will cause overflows (and hence random pixel	209 // These "impossible" colors will cause overflows (and hence random pixel

206 // values) when the resulting bitmap is drawn to the screen.	210 // values) when the resulting bitmap is drawn to the screen.

207 //	211 //

208 // We only need to do this when generating the final output row (here).	212 // We only need to do this when generating the final output row (here).

209 int max_color_channel = std::max(out_row[byte_offset + 0],	213 int max_color_channel = std::max(out_row[byte_offset + 0],

210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));	214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));

211 if (alpha < max_color_channel)	215 if (alpha < max_color_channel)

212 out_row[byte_offset + 3] = max_color_channel;	216 out_row[byte_offset + 3] = max_color_channel;

213 else	217 else

214 out_row[byte_offset + 3] = alpha;	218 out_row[byte_offset + 3] = alpha;

215 } else {	219 } else {

216 // No alpha channel, the image is opaque.	220 // No alpha channel, the image is opaque.

217 out_row[byte_offset + 3] = 0xff;	221 out_row[byte_offset + 3] = 0xff;

218 }	222 }

219 }	223 }

220 }	224 }

221	225

	226

	227 // Convolves horizontally along a single row. The row data is given in

	228 // \|src_data\| and continues for the num_values() of the filter.

	229 void ConvolveHorizontally_SSE2(const unsigned char* src_data,

	230 const ConvolutionFilter1D& filter,

	231 unsigned char* out_row) {

	232 #if defined(SIMD_SSE2)

	233 int num_values = filter.num_values();

	234

	235 int filter_offset, filter_length;

	236 __m128i zero = _mm_setzero_si128();

	237 __m128i mask[4];

	238 // \|mask\| will be used to decimate all extra filter coefficients that are

	239 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	240 // mask[0] is not used in following algorithm.

	241 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

	242 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

	243 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

	244

	245 // Output one pixel each iteration, calculating all channels (RGBA) together.

	246 for (int out_x = 0; out_x < num_values; out_x++) {

	247 const ConvolutionFilter1D::Fixed* filter_values =

	248 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	249

	250 __m128i accum = _mm_setzero_si128();

	251

	252 // Compute the first pixel in this row that the filter affects. It will

	253 // touch \|filter_length\| pixels (4 bytes each) after this.

	254 const __m128i* row_to_filter =

	255 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

	256

	257 // We will load and accumulate with four coefficients per iteration.

	258 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

	259

	260 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.

	261 __m128i coeff, coeff16;

	262 // [16] xx xx xx xx c3 c2 c1 c0

	263 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	264 // [16] xx xx xx xx c1 c1 c0 c0

	265 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	266 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	267 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	268

	269 // Load four pixels => unpack the first two pixels to 16 bits =>

	270 // multiply with coefficients => accumulate the convolution result.

	271 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	272 __m128i src8 = _mm_loadu_si128(row_to_filter);

	273 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	274 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	275 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	276 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	277 // [32] a0c0 b0c0 g0c0 r0c0

	278 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	279 accum = _mm_add_epi32(accum, t);

	280 // [32] a1c1 b1c1 g1c1 r1c1

	281 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	282 accum = _mm_add_epi32(accum, t);

	283

	284 // Duplicate 3rd and 4th coefficients for all channels =>

	285 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients

	286 // => accumulate the convolution results.

	287 // [16] xx xx xx xx c3 c3 c2 c2

	288 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	289 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	290 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	291 // [16] a3 g3 b3 r3 a2 g2 b2 r2

	292 src16 = _mm_unpackhi_epi8(src8, zero);

	293 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	294 mul_lo = _mm_mullo_epi16(src16, coeff16);

	295 // [32] a2c2 b2c2 g2c2 r2c2

	296 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	297 accum = _mm_add_epi32(accum, t);

	298 // [32] a3c3 b3c3 g3c3 r3c3

	299 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	300 accum = _mm_add_epi32(accum, t);

	301

	302 // Advance the pixel and coefficients pointers.

	303 row_to_filter += 1;

	304 filter_values += 4;

	305 }

	306

	307 // When \|filter_length\| is not divisible by 4, we need to zero out the

	308 // extra filter coefficients that were loaded past the end of the filter.

	309 // Otherwise the algorithm is the same as above, except that the 4th pixel

	310 // is always absent.

	311 int r = filter_length&3;

	312 if (r) {

	313 // Note: filter_values must be padded to align_up(filter_offset, 8).

	314 __m128i coeff, coeff16;

	315 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	316 // Mask out extra filter taps.

	317 coeff = _mm_and_si128(coeff, mask[r]);

	318 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	319 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	320

	321 // Note: line buffer must be padded to align_up(filter_offset, 16).

	322 // We resolve this by using the C version for the last horizontal line.

	323 __m128i src8 = _mm_loadu_si128(row_to_filter);

	324 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	325 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	326 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	327 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	328 accum = _mm_add_epi32(accum, t);

	329 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	330 accum = _mm_add_epi32(accum, t);

	331

	332 src16 = _mm_unpackhi_epi8(src8, zero);

	333 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	334 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	335 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	336 mul_lo = _mm_mullo_epi16(src16, coeff16);

	337 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	338 accum = _mm_add_epi32(accum, t);

	339 }

	340

	341 // Shift right for fixed point implementation.

	342 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);

	343

	344 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	345 accum = _mm_packs_epi32(accum, zero);

	346 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	347 accum = _mm_packus_epi16(accum, zero);

	348

	349 // Store the pixel value of 32 bits.

	350 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);

	351 out_row += 4;

	352 }

	353 #endif

	354 }

	355

	356 // Convolves horizontally along four rows. The row data is given in

	357 // \|src_data\| and continues for the num_values() of the filter.

	358 // The algorithm is almost the same as \|ConvolveHorizontally_SSE2\|. Please

	359 // refer to that function for detailed comments.

	360 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],

	361 const ConvolutionFilter1D& filter,

	362 unsigned char* out_row[4]) {

	363 #if defined(SIMD_SSE2)

	364 int num_values = filter.num_values();

	365

	366 int filter_offset, filter_length;

	367 __m128i zero = _mm_setzero_si128();

	368 __m128i mask[4];

	369 // \|mask\| will be used to decimate all extra filter coefficients that are

	370 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	371 // mask[0] is not used in following algorithm.

	372 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

	373 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

	374 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

	375

	376 // Output one pixel each iteration, calculating all channels (RGBA) together.

	377 for (int out_x = 0; out_x < num_values; out_x++) {

	378 const ConvolutionFilter1D::Fixed* filter_values =

	379 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	380

	381 // four pixels in a column per iteration.

	382 __m128i accum0 = _mm_setzero_si128();

	383 __m128i accum1 = _mm_setzero_si128();

	384 __m128i accum2 = _mm_setzero_si128();

	385 __m128i accum3 = _mm_setzero_si128();

	386 int start = (filter_offset<<2);

	387 // We will load and accumulate with four coefficients per iteration.

	388 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {

	389 __m128i coeff, coeff16lo, coeff16hi;

	390 // [16] xx xx xx xx c3 c2 c1 c0

	391 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	392 // [16] xx xx xx xx c1 c1 c0 c0

	393 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	394 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	395 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	396 // [16] xx xx xx xx c3 c3 c2 c2

	397 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	398 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	399 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	400

	401 __m128i src8, src16, mul_hi, mul_lo, t;

	402

	403 #define ITERATION(src, accum) \

	404 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \

	405 src16 = _mm_unpacklo_epi8(src8, zero); \

	406 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \

	407 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \

	408 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	409 accum = _mm_add_epi32(accum, t); \

	410 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	411 accum = _mm_add_epi32(accum, t); \

	412 src16 = _mm_unpackhi_epi8(src8, zero); \

	413 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \

	414 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \

	415 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	416 accum = _mm_add_epi32(accum, t); \

	417 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	418 accum = _mm_add_epi32(accum, t)

	419

	420 ITERATION(src_data[0] + start, accum0);

	421 ITERATION(src_data[1] + start, accum1);

	422 ITERATION(src_data[2] + start, accum2);

	423 ITERATION(src_data[3] + start, accum3);

	424

	425 start += 16;

	426 filter_values += 4;

	427 }

	428

	429 int r = filter_length & 3;

	430 if (r) {

	431 // Note: filter_values must be padded to align_up(filter_offset, 8);

	432 __m128i coeff;

	433 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	434 // Mask out extra filter taps.

	435 coeff = _mm_and_si128(coeff, mask[r]);

	436

	437 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	438 /* c1 c1 c1 c1 c0 c0 c0 c0 */

	439 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	440 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	441 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	442

	443 __m128i src8, src16, mul_hi, mul_lo, t;

	444

	445 ITERATION(src_data[0] + start, accum0);

	446 ITERATION(src_data[1] + start, accum1);

	447 ITERATION(src_data[2] + start, accum2);

	448 ITERATION(src_data[3] + start, accum3);

	449 }

	450

	451 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

	452 accum0 = _mm_packs_epi32(accum0, zero);

	453 accum0 = _mm_packus_epi16(accum0, zero);

	454 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

	455 accum1 = _mm_packs_epi32(accum1, zero);

	456 accum1 = _mm_packus_epi16(accum1, zero);

	457 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

	458 accum2 = _mm_packs_epi32(accum2, zero);

	459 accum2 = _mm_packus_epi16(accum2, zero);

	460 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

	461 accum3 = _mm_packs_epi32(accum3, zero);

	462 accum3 = _mm_packus_epi16(accum3, zero);

	463

	464 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);

	465 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);

	466 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);

	467 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

	468

	469 out_row[0] += 4;

	470 out_row[1] += 4;

	471 out_row[2] += 4;

	472 out_row[3] += 4;

	473 }

	474 #endif

	475 }

	476

	477 // Does vertical convolution to produce one output row. The filter values and

	478 // length are given in the first two parameters. These are applied to each

	479 // of the rows pointed to in the \|source_data_rows\| array, with each row

	480 // being \|pixel_width\| wide.

	481 //

	482 // The output must have room for \|pixel_width * 4\| bytes.

	483 template<bool has_alpha>

	484 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,

	485 int filter_length,

	486 unsigned char* const* source_data_rows,

	487 int pixel_width,

	488 unsigned char* out_row) {

	489 #if defined(SIMD_SSE2)

	490 int width = pixel_width & ~3;

	491

	492 __m128i zero = _mm_setzero_si128();

	493 __m128i accum0, accum1, accum2, accum3, coeff16;

	494 const __m128i* src;

	495 // Output four pixels per iteration (16 bytes).

	496 for (int out_x = 0; out_x < width; out_x += 4) {

	497

	498 // Accumulated result for each pixel. 32 bits per RGBA channel.

	499 accum0 = _mm_setzero_si128();

	500 accum1 = _mm_setzero_si128();

	501 accum2 = _mm_setzero_si128();

	502 accum3 = _mm_setzero_si128();

	503

	504 // Convolve with one filter coefficient per iteration.

	505 for (int filter_y = 0; filter_y < filter_length; filter_y++) {

	506

	507 // Duplicate the filter coefficient 8 times.

	508 // [16] cj cj cj cj cj cj cj cj

	509 coeff16 = _mm_set1_epi16(filter_values[filter_y]);

	510

	511 // Load four pixels (16 bytes) together.

	512 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	513 src = reinterpret_cast<const __m128i*>(

	514 &source_data_rows[filter_y][out_x << 2]);

	515 __m128i src8 = _mm_loadu_si128(src);

	516

	517 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>

	518 // multiply with current coefficient => accumulate the result.

	519 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	520 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	521 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	522 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	523 // [32] a0 b0 g0 r0

	524 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	525 accum0 = _mm_add_epi32(accum0, t);

	526 // [32] a1 b1 g1 r1

	527 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	528 accum1 = _mm_add_epi32(accum1, t);

	529

	530 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>

	531 // multiply with current coefficient => accumulate the result.

	532 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	533 src16 = _mm_unpackhi_epi8(src8, zero);

	534 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	535 mul_lo = _mm_mullo_epi16(src16, coeff16);

	536 // [32] a2 b2 g2 r2

	537 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	538 accum2 = _mm_add_epi32(accum2, t);

	539 // [32] a3 b3 g3 r3

	540 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	541 accum3 = _mm_add_epi32(accum3, t);

	542 }

	543

	544 // Shift right for fixed point implementation.

	545 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

	546 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

	547 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

	548 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

	549

	550 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	551 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	552 accum0 = _mm_packs_epi32(accum0, accum1);

	553 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	554 accum2 = _mm_packs_epi32(accum2, accum3);

	555

	556 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	557 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	558 accum0 = _mm_packus_epi16(accum0, accum2);

	559

	560 if (has_alpha) {

	561 // Compute the max(ri, gi, bi) for each pixel.

	562 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	563 __m128i a = _mm_srli_epi32(accum0, 8);

	564 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	565 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	566 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	567 a = _mm_srli_epi32(accum0, 16);

	568 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	569 b = _mm_max_epu8(a, b); // Max of r and g and b.

	570 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	571 b = _mm_slli_epi32(b, 24);

	572

	573 // Make sure the value of the alpha channel is never smaller than the

	574 // maximum value of the color channels.

	575 accum0 = _mm_max_epu8(b, accum0);

	576 } else {

	577 // Set value of alpha channels to 0xFF.

	578 __m128i mask = _mm_set1_epi32(0xff000000);

	579 accum0 = _mm_or_si128(accum0, mask);

	580 }

	581

	582 // Store the convolution result (16 bytes) and advance the pixel pointers.

	583 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);

	584 out_row += 16;

	585 }

	586

	587 // When the width of the output is not divisible by 4, we need to save one

	588 // pixel (4 bytes) at a time. Also, the fourth pixel is always absent.

	589 if (pixel_width & 3) {

	590 accum0 = _mm_setzero_si128();

	591 accum1 = _mm_setzero_si128();

	592 accum2 = _mm_setzero_si128();

	593 for (int filter_y = 0; filter_y < filter_length; ++filter_y) {

	594 coeff16 = _mm_set1_epi16(filter_values[filter_y]);

	595 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	596 src = reinterpret_cast<const __m128i*>(

	597 &source_data_rows[filter_y][width<<2]);

	598 __m128i src8 = _mm_loadu_si128(src);

	599 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	600 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	601 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	602 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	603 // [32] a0 b0 g0 r0

	604 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	605 accum0 = _mm_add_epi32(accum0, t);

	606 // [32] a1 b1 g1 r1

	607 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	608 accum1 = _mm_add_epi32(accum1, t);

	609 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	610 src16 = _mm_unpackhi_epi8(src8, zero);

	611 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	612 mul_lo = _mm_mullo_epi16(src16, coeff16);

	613 // [32] a2 b2 g2 r2

	614 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	615 accum2 = _mm_add_epi32(accum2, t);

	616 }

	617

	618 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

	619 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

	620 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

	621 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	622 accum0 = _mm_packs_epi32(accum0, accum1);

	623 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	624 accum2 = _mm_packs_epi32(accum2, zero);

	625 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	626 accum0 = _mm_packus_epi16(accum0, accum2);

	627 if (has_alpha) {

	628 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	629 __m128i a = _mm_srli_epi32(accum0, 8);

	630 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	631 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	632 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	633 a = _mm_srli_epi32(accum0, 16);

	634 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	635 b = _mm_max_epu8(a, b); // Max of r and g and b.

	636 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	637 b = _mm_slli_epi32(b, 24);

	638 accum0 = _mm_max_epu8(b, accum0);

	639 } else {

	640 __m128i mask = _mm_set1_epi32(0xff000000);

	641 accum0 = _mm_or_si128(accum0, mask);

	642 }

	643

	644 for (int out_x = width; out_x < pixel_width; out_x++) {

	645 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);

	646 accum0 = _mm_srli_si128(accum0, 4);

	647 out_row += 4;

	648 }

	649 }

	650 #endif

	651 }

	652

222 } // namespace	653 } // namespace

223	654

224 // ConvolutionFilter1D ---------------------------------------------------------	655 // ConvolutionFilter1D ---------------------------------------------------------

225	656

226 ConvolutionFilter1D::ConvolutionFilter1D()	657 ConvolutionFilter1D::ConvolutionFilter1D()

227 : max_filter_(0) {	658 : max_filter_(0) {

228 }	659 }

229	660

230 ConvolutionFilter1D::~ConvolutionFilter1D() {	661 ConvolutionFilter1D::~ConvolutionFilter1D() {

231 }	662 }

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
277 // We pushed filter_length elements onto filter_values_	708 // We pushed filter_length elements onto filter_values_

278 instance.data_location = (static_cast<int>(filter_values_.size()) -	709 instance.data_location = (static_cast<int>(filter_values_.size()) -

279 filter_length);	710 filter_length);

280 instance.offset = filter_offset;	711 instance.offset = filter_offset;

281 instance.length = filter_length;	712 instance.length = filter_length;

282 filters_.push_back(instance);	713 filters_.push_back(instance);

283	714

284 max_filter_ = std::max(max_filter_, filter_length);	715 max_filter_ = std::max(max_filter_, filter_length);

285 }	716 }

286	717

287 // BGRAConvolve2D -------------------------------------------------------------

288

289 void BGRAConvolve2D(const unsigned char* source_data,	718 void BGRAConvolve2D(const unsigned char* source_data,

290 int source_byte_row_stride,	719 int source_byte_row_stride,

291 bool source_has_alpha,	720 bool source_has_alpha,

292 const ConvolutionFilter1D& filter_x,	721 const ConvolutionFilter1D& filter_x,

293 const ConvolutionFilter1D& filter_y,	722 const ConvolutionFilter1D& filter_y,

294 int output_byte_row_stride,	723 int output_byte_row_stride,

295 unsigned char* output) {	724 unsigned char* output,

	725 bool use_sse2) {

	726 #if !defined(SIMD_SSE2)

	727 // Even if we have runtime support for SSE2 instructions, the binary was

	728 // not built with SSE2 support, so we have to fall back to the C version.

	729 use_sse2 = false;

	730 #endif

	731

296 int max_y_filter_size = filter_y.max_filter();	732 int max_y_filter_size = filter_y.max_filter();

297	733

298 // The next row in the input that we will generate a horizontally	734 // The next row in the input that we will generate a horizontally

299 // convolved row for. If the filter doesn't start at the beginning of the	735 // convolved row for. If the filter doesn't start at the beginning of the

300 // image (this is the case when we are only resizing a subset), then we	736 // image (this is the case when we are only resizing a subset), then we

301 // don't want to generate any output rows before that. Compute the starting	737 // don't want to generate any output rows before that. Compute the starting

302 // row for convolution as the first pixel for the first vertical filter.	738 // row for convolution as the first pixel for the first vertical filter.

303 int filter_offset, filter_length;	739 int filter_offset, filter_length;

304 const ConvolutionFilter1D::Fixed* filter_values =	740 const ConvolutionFilter1D::Fixed* filter_values =

305 filter_y.FilterForValue(0, &filter_offset, &filter_length);	741 filter_y.FilterForValue(0, &filter_offset, &filter_length);

306 int next_x_row = filter_offset;	742 int next_x_row = filter_offset;

307	743

308 // We loop over each row in the input doing a horizontal convolution. This	744 // We loop over each row in the input doing a horizontal convolution. This

309 // will result in a horizontally convolved image. We write the results into	745 // will result in a horizontally convolved image. We write the results into

310 // a circular buffer of convolved rows and do vertical convolution as rows	746 // a circular buffer of convolved rows and do vertical convolution as rows

311 // are available. This prevents us from having to store the entire	747 // are available. This prevents us from having to store the entire

312 // intermediate image and helps cache coherency.	748 // intermediate image and helps cache coherency.

313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size,	749 // We need four extra rows so that four horizontal convolutions can be done

	750 // simultaneously. We also pad each row in the row buffer so that it is

	751 // aligned up to 16 bytes.

	752 // TODO(jiesun): We do not use aligned load from row buffer in vertical

	753 // convolution pass yet. Somehow Windows does not like it.

	754 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;

	755 int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);

	756 CircularRowBuffer row_buffer(row_buffer_width,

	757 row_buffer_height,

314 filter_offset);	758 filter_offset);

315	759

316 // Loop over every possible output row, processing just enough horizontal	760 // Loop over every possible output row, processing just enough horizontal

317 // convolutions to run each subsequent vertical convolution.	761 // convolutions to run each subsequent vertical convolution.

318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);	762 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);

319 int num_output_rows = filter_y.num_values();	763 int num_output_rows = filter_y.num_values();

	764

	765 // We need to check which is the last line to convolve before we advance 4

	766 // lines in one iteration.

	767 int last_filter_offset, last_filter_length;

	768 filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,

	769 &last_filter_length);

	770

320 for (int out_y = 0; out_y < num_output_rows; out_y++) {	771 for (int out_y = 0; out_y < num_output_rows; out_y++) {

321 filter_values = filter_y.FilterForValue(out_y,	772 filter_values = filter_y.FilterForValue(out_y,

322 &filter_offset, &filter_length);	773 &filter_offset, &filter_length);

323	774

324 // Generate output rows until we have enough to run the current filter.	775 // Generate output rows until we have enough to run the current filter.

325 while (next_x_row < filter_offset + filter_length) {	776 if (use_sse2) {

326 if (source_has_alpha) {	777 while (next_x_row < filter_offset + filter_length) {

327 ConvolveHorizontally<true>(	778 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {

328 &source_data[next_x_row * source_byte_row_stride],	779 const unsigned char* src[4];

329 filter_x, row_buffer.AdvanceRow());	780 unsigned char* out_row[4];

330 } else {	781 for (int i = 0; i < 4; ++i) {

331 ConvolveHorizontally<false>(	782 src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];

332 &source_data[next_x_row * source_byte_row_stride],	783 out_row[i] = row_buffer.AdvanceRow();

333 filter_x, row_buffer.AdvanceRow());	784 }

	785 ConvolveHorizontally4_SSE2(src, filter_x, out_row);

	786 next_x_row += 4;

	787 } else {

	788 // For the last row, an SSE2 load may access data beyond the

	789 // image area, so we use the C version here.

	790 if (next_x_row == last_filter_offset + last_filter_length - 1) {

	791 if (source_has_alpha) {

	792 ConvolveHorizontally<true>(

	793 &source_data[next_x_row * source_byte_row_stride],

	794 filter_x, row_buffer.AdvanceRow());

	795 } else {

	796 ConvolveHorizontally<false>(

	797 &source_data[next_x_row * source_byte_row_stride],

	798 filter_x, row_buffer.AdvanceRow());

	799 }

	800 } else {

	801 ConvolveHorizontally_SSE2(

	802 &source_data[next_x_row * source_byte_row_stride],

	803 filter_x, row_buffer.AdvanceRow());

	804 }

	805 next_x_row++;

	806 }

334 }	807 }

335 next_x_row++;	808 } else {

	809 while (next_x_row < filter_offset + filter_length) {

	810 if (source_has_alpha) {

	811 ConvolveHorizontally<true>(

	812 &source_data[next_x_row * source_byte_row_stride],

	813 filter_x, row_buffer.AdvanceRow());

	814 } else {

	815 ConvolveHorizontally<false>(

	816 &source_data[next_x_row * source_byte_row_stride],

	817 filter_x, row_buffer.AdvanceRow());

	818 }

	819 next_x_row++;

	820 }

336 }	821 }

337	822

338 // Compute where in the output image this row of final data will go.	823 // Compute where in the output image this row of final data will go.

339 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];	824 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];

340	825

341 // Get the list of rows that the circular buffer has, in order.	826 // Get the list of rows that the circular buffer has, in order.

342 int first_row_in_circular_buffer;	827 int first_row_in_circular_buffer;

343 unsigned char* const* rows_to_convolve =	828 unsigned char* const* rows_to_convolve =

344 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);	829 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);

345	830

346 // Now compute the start of the subset of those rows that the filter	831 // Now compute the start of the subset of those rows that the filter

347 // needs.	832 // needs.

348 unsigned char* const* first_row_for_filter =	833 unsigned char* const* first_row_for_filter =

349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];	834 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];

350	835

351 if (source_has_alpha) {	836 if (source_has_alpha) {

352 ConvolveVertically<true>(filter_values, filter_length,	837 if (use_sse2) {

353 first_row_for_filter,	838 ConvolveVertically_SSE2<true>(filter_values, filter_length,

354 filter_x.num_values(), cur_output_row);	839 first_row_for_filter,

	840 filter_x.num_values(), cur_output_row);

	841 } else {

	842 ConvolveVertically<true>(filter_values, filter_length,

	843 first_row_for_filter,

	844 filter_x.num_values(), cur_output_row);

	845 }

355 } else {	846 } else {

356 ConvolveVertically<false>(filter_values, filter_length,	847 if (use_sse2) {

357 first_row_for_filter,	848 ConvolveVertically_SSE2<false>(filter_values, filter_length,

358 filter_x.num_values(), cur_output_row);	849 first_row_for_filter,

	850 filter_x.num_values(), cur_output_row);

	851 } else {

	852 ConvolveVertically<false>(filter_values, filter_length,

	853 first_row_for_filter,

	854 filter_x.num_values(), cur_output_row);

	855 }

359 }	856 }

360 }	857 }

361 }	858 }

362	859

363 } // namespace skia	860 } // namespace skia

OLD	NEW

« no previous file with comments | « skia/ext/convolver.h ('k') | skia/ext/convolver_unittest.cc » ('j') | no next file with comments »