OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <algorithm> | 5 #include <algorithm> |
6 | 6 |
7 #include "skia/ext/convolver.h" | 7 #include "skia/ext/convolver.h" |
8 #include "third_party/skia/include/core/SkTypes.h" | 8 #include "third_party/skia/include/core/SkTypes.h" |
9 | 9 |
10 #if defined(ARCH_CPU_X86_FAMILY) | |
11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h | |
12 #endif | |
13 | |
10 namespace skia { | 14 namespace skia { |
11 | 15 |
12 namespace { | 16 namespace { |
13 | 17 |
14 // Converts the argument to an 8-bit unsigned value by clamping to the range | 18 // Converts the argument to an 8-bit unsigned value by clamping to the range |
15 // 0-255. | 19 // 0-255. |
16 inline unsigned char ClampTo8(int a) { | 20 inline unsigned char ClampTo8(int a) { |
17 if (static_cast<unsigned>(a) < 256) | 21 if (static_cast<unsigned>(a) < 256) |
18 return a; // Avoid the extra check in the common case. | 22 return a; // Avoid the extra check in the common case. |
19 if (a < 0) | 23 if (a < 0) |
(...skipping 172 matching lines...) | |
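A side note on the ClampTo8 fast path visible above: casting |a| to unsigned wraps any negative value around to a huge number, so the single comparison against 256 rejects both a < 0 and a > 255 at once. A minimal sketch of the equivalent two-branch form (illustrative only, not part of this CL):

inline unsigned char ClampTo8Equivalent(int a) {
  if (a < 0)
    return 0;    // static_cast<unsigned>(a) < 256 is false here, too
  if (a > 255)
    return 255;
  return a;      // the common case, which the real function tests first
}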
192 if (has_alpha) | 196 if (has_alpha) |
193 accum[3] >>= ConvolutionFilter1D::kShiftBits; | 197 accum[3] >>= ConvolutionFilter1D::kShiftBits; |
194 | 198 |
195 // Store the new pixel. | 199 // Store the new pixel. |
196 out_row[byte_offset + 0] = ClampTo8(accum[0]); | 200 out_row[byte_offset + 0] = ClampTo8(accum[0]); |
197 out_row[byte_offset + 1] = ClampTo8(accum[1]); | 201 out_row[byte_offset + 1] = ClampTo8(accum[1]); |
198 out_row[byte_offset + 2] = ClampTo8(accum[2]); | 202 out_row[byte_offset + 2] = ClampTo8(accum[2]); |
199 if (has_alpha) { | 203 if (has_alpha) { |
200 unsigned char alpha = ClampTo8(accum[3]); | 204 unsigned char alpha = ClampTo8(accum[3]); |
201 | 205 |
202 // Make sure the alpha channel doesn't come out larger than any of the | 206 // Make sure the alpha channel doesn't come out smaller than any of the |
203 // color channels. We use premultiplied alpha channels, so this should | 207 // color channels. We use premultiplied alpha channels, so this should |
204 // never happen, but rounding errors will cause this from time to time. | 208 // never happen, but rounding errors will cause this from time to time. |
205 // These "impossible" colors will cause overflows (and hence random pixel | 209 // These "impossible" colors will cause overflows (and hence random pixel |
206 // values) when the resulting bitmap is drawn to the screen. | 210 // values) when the resulting bitmap is drawn to the screen. |
207 // | 211 // |
208 // We only need to do this when generating the final output row (here). | 212 // We only need to do this when generating the final output row (here). |
209 int max_color_channel = std::max(out_row[byte_offset + 0], | 213 int max_color_channel = std::max(out_row[byte_offset + 0], |
210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); | 214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); |
211 if (alpha < max_color_channel) | 215 if (alpha < max_color_channel) |
212 out_row[byte_offset + 3] = max_color_channel; | 216 out_row[byte_offset + 3] = max_color_channel; |
213 else | 217 else |
214 out_row[byte_offset + 3] = alpha; | 218 out_row[byte_offset + 3] = alpha; |
215 } else { | 219 } else { |
216 // No alpha channel, the image is opaque. | 220 // No alpha channel, the image is opaque. |
217 out_row[byte_offset + 3] = 0xff; | 221 out_row[byte_offset + 3] = 0xff; |
218 } | 222 } |
219 } | 223 } |
220 } | 224 } |
221 | 225 |
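For intuition on the rounding problem handled above: premultiplied alpha requires every color channel to satisfy channel <= alpha, but each channel's fixed-point accumulator is shifted down independently, so a color channel can land one count above alpha. A tiny numeric sketch (the 8-bit shift is assumed purely for illustration; the real shift is ConvolutionFilter1D::kShiftBits):

// Hypothetical accumulator values after filtering:
int g = 33540 >> 8;  // == 131
int a = 33500 >> 8;  // == 130  -> g > a: an "impossible" premultiplied pixel
// The clamp above repairs this by raising alpha to max(r, g, b).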
226 | |
227 // Convolves horizontally along a single row. The row data is given in | |
228 // |src_data| and continues for the num_values() of the filter. | |
229 void ConvolveHorizontally_SSE2(const unsigned char* src_data, | |
230 const ConvolutionFilter1D& filter, | |
231 unsigned char* out_row) { | |
232 #ifdef ARCH_CPU_X86_FAMILY | |
233 int num_values = filter.num_values(); | |
234 | |
235 int filter_offset, filter_length; | |
236 __m128i zero = _mm_setzero_si128(); | |
237 __m128i mask[4]; | |
238 // |mask| will be used to decimate all extra filter coefficients that are | |
239 // loaded by SIMD when |filter_length| is not divisible by 4. | |
240 // mask[0] is not used in the following algorithm. | |
241 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
242 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
243 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
244 | |
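To make the tap masking concrete: the coefficients are 16-bit fixed-point values and _mm_loadl_epi64 always pulls in four of them, so when only r = filter_length & 3 taps remain, the load also picks up whatever sits in the padding. ANDing with mask[r] zeroes those lanes. A small self-contained sketch for r == 3 (the helper name is invented for the sketch):

#include <emmintrin.h>

__m128i MaskThreeRemainderTaps(const short taps[4]) {
  // taps[3] may be garbage read from padding; mask[3] zeroes that lane.
  __m128i coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(taps));
  __m128i mask3 = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
  return _mm_and_si128(coeff, mask3);  // lanes: taps[0], taps[1], taps[2], 0
}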
245 // Output one pixel each iteration, calculating all channels (RGBA) together. | |
246 for (int out_x = 0; out_x < num_values; out_x += 1) { | |
247 const ConvolutionFilter1D::Fixed* filter_values = | |
248 filter.FilterForValue(out_x, &filter_offset, &filter_length); | |
249 | |
250 __m128i accum = _mm_setzero_si128(); | |
251 | |
252 // Compute the first pixel in this row that the filter affects. It will | |
253 // touch |filter_length| pixels (4 bytes each) after this. | |
254 const __m128i* row_to_filter = | |
255 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); | |
256 | |
257 // We will load and accumulate with four coefficients per iterations. | |
brettw
2011/02/25 06:43:02
Grammar nit: iteration (no "s"). Same below for wh
jiesun
2011/03/07 18:57:15
Done.
| |
258 for (int j = 0; j < filter_length >> 2; ++j) { | |
259 | |
260 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | |
261 __m128i coeff, coeff16; | |
262 // [16] xx xx xx xx c3 c2 c1 c0 | |
263 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
264 // [16] xx xx xx xx c1 c1 c0 c0 | |
265 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
266 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
267 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
268 | |
269 // Load four pixels => unpack the first two pixels to 16 bits => | |
270 // multiply with coefficients => accumulate the convolution result. | |
271 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
272 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
273 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
274 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
275 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
276 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
277 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | |
278 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
279 accum = _mm_add_epi32(accum, t); | |
280 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | |
281 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
282 accum = _mm_add_epi32(accum, t); | |
283 | |
284 // Duplicate 3rd and 4th coefficients for all channels => | |
285 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients | |
286 // => accumulate the convolution results. | |
287 // [16] xx xx xx xx c3 c3 c2 c2 | |
288 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
289 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
290 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
292 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
292 src16 = _mm_unpackhi_epi8(src8, zero); | |
293 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
294 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
295 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | |
296 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
297 accum = _mm_add_epi32(accum, t); | |
298 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | |
299 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
300 accum = _mm_add_epi32(accum, t); | |
301 | |
302 // Advance the pixel and coefficients pointers. | |
303 row_to_filter += 1; | |
304 filter_values += 4; | |
305 } | |
306 | |
307 // When |filter_length| is not divisible by 4, we need to decimate to zero | |
308 // the extra filter coefficients that were loaded incorrectly; other than | |
309 // that, the algorithm is the same as above, except that the 4th pixel will | |
310 // always be absent. | |
311 int r = filter_length&3; | |
312 if (r) { | |
313 // Note: filter_values must be padded to align_up(filter_offset, 8). | |
314 __m128i coeff, coeff16; | |
315 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
316 // Mask out extra filter taps. | |
317 coeff = _mm_and_si128(coeff, mask[r]); | |
318 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
319 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
320 | |
321 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
322 // We resolve this by using the C version for the last horizontal line. | |
323 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
324 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
325 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
326 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
327 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
328 accum = _mm_add_epi32(accum, t); | |
329 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
330 accum = _mm_add_epi32(accum, t); | |
331 | |
332 src16 = _mm_unpackhi_epi8(src8, zero); | |
333 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
334 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
335 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
336 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
337 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
338 accum = _mm_add_epi32(accum, t); | |
339 } | |
340 | |
341 // Shift right for fixed point implementation. | |
342 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits); | |
343 | |
344 // Pack the 32-bit |accum| into 16 bits per channel (signed saturation). | |
345 accum = _mm_packs_epi32(accum, zero); | |
346 // Pack the 16-bit |accum| into 8 bits per channel (unsigned saturation). | |
347 accum = _mm_packus_epi16(accum, zero); | |
348 | |
349 // Store the 32-bit pixel value. | |
350 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); | |
351 out_row += 4; | |
352 } | |
353 #endif | |
354 } | |
355 | |
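A note on the mulhi/mullo pattern used throughout this function: SSE2 has no 16x16 -> 32-bit widening multiply, so the code computes the low and high 16-bit halves of each product separately and interleaves them with unpacklo/unpackhi into full 32-bit products. A scalar model of one lane, as a sketch (not code from this CL):

#include <stdint.h>

int32_t WidenedLaneProduct(int16_t src, int16_t coeff) {
  int32_t product = src * coeff;                     // the exact 32-bit result
  uint16_t lo = static_cast<uint16_t>(product);      // _mm_mullo_epi16 lane
  int16_t hi = static_cast<int16_t>(product >> 16);  // _mm_mulhi_epi16 lane
  // _mm_unpacklo/hi_epi16(mul_lo, mul_hi) pairs the halves back together:
  uint32_t paired =
      (static_cast<uint32_t>(static_cast<uint16_t>(hi)) << 16) | lo;
  return static_cast<int32_t>(paired);               // == product
}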
356 // Convolves horizontally along four rows. The row data is given in | |
357 // |src_data| and continues for the num_values() of the filter. | |
358 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please | |
359 // refer to that function for detailed comments. | |
360 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4], | |
361 const ConvolutionFilter1D& filter, | |
362 unsigned char* out_row[4]) { | |
363 #ifdef ARCH_CPU_X86_FAMILY | |
364 int num_values = filter.num_values(); | |
365 | |
366 int filter_offset, filter_length; | |
367 __m128i zero = _mm_setzero_si128(); | |
368 __m128i mask[4]; | |
369 // |mask| will be used to decimate all extra filter coefficients that are | |
370 // loaded by SIMD when |filter_length| is not divisible by 4. | |
371 // mask[0] is not used in the following algorithm. | |
372 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
373 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
374 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
375 | |
376 // Output one pixel each iteration, calculating all channels (RGBA) together. | |
377 for (int i = 0; i < num_values; ++i) { | |
brettw
2011/02/25 06:43:02
It's weird this for loop is so different than the
jiesun
2011/03/07 18:57:15
Done.
| |
378 const ConvolutionFilter1D::Fixed* filter_values = | |
379 filter.FilterForValue(i, &filter_offset, &filter_length); | |
380 | |
381 // Four pixels in a column per iteration. | |
382 __m128i accum0 = _mm_setzero_si128(); | |
383 __m128i accum1 = _mm_setzero_si128(); | |
384 __m128i accum2 = _mm_setzero_si128(); | |
385 __m128i accum3 = _mm_setzero_si128(); | |
386 int start = (filter_offset<<2); | |
387 // We will load and accumulate with four coefficients per iterations. | |
388 for (int j = 0; j < (filter_length >> 2); ++j) { | |
389 __m128i coeff, coeff16lo, coeff16hi; | |
390 // [16] xx xx xx xx c3 c2 c1 c0 | |
391 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
392 // [16] xx xx xx xx c1 c1 c0 c0 | |
393 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
394 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
395 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
396 // [16] xx xx xx xx c3 c3 c2 c2 | |
397 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
398 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
399 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
400 | |
401 __m128i src8, src16, mul_hi, mul_lo, t; | |
402 | |
403 #define ITERATION(src, accum) \ | |
404 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | |
405 src16 = _mm_unpacklo_epi8(src8, zero); \ | |
406 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | |
407 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | |
408 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
409 accum = _mm_add_epi32(accum, t); \ | |
410 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
411 accum = _mm_add_epi32(accum, t); \ | |
412 src16 = _mm_unpackhi_epi8(src8, zero); \ | |
413 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | |
414 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | |
415 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
416 accum = _mm_add_epi32(accum, t); \ | |
417 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
418 accum = _mm_add_epi32(accum, t) | |
419 | |
420 ITERATION(src_data[0]+start, accum0); | |
421 ITERATION(src_data[1]+start, accum1); | |
422 ITERATION(src_data[2]+start, accum2); | |
423 ITERATION(src_data[3]+start, accum3); | |
424 | |
425 start += 16; | |
426 filter_values += 4; | |
427 } | |
428 | |
429 int r = filter_length&3; | |
brettw
2011/02/25 06:43:02
Please put a space around the &
jiesun
2011/03/07 18:57:15
Done.
| |
430 if (r) { | |
431 // Note: filter_values must be padded to align_up(filter_offset, 8); | |
432 __m128i coeff; | |
433 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
434 // Mask out extra filter taps. | |
435 coeff = _mm_and_si128(coeff, mask[r]); | |
436 | |
437 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
438 /* c1 c1 c1 c1 c0 c0 c0 c0 */ | |
439 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
440 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
441 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
442 | |
443 __m128i src8, src16, mul_hi, mul_lo, t; | |
444 | |
445 ITERATION(src_data[0]+start, accum0); | |
brettw
2011/02/25 06:43:02
Spaces around the +'s
jiesun
2011/03/07 18:57:15
Done.
| |
446 ITERATION(src_data[1]+start, accum1); | |
447 ITERATION(src_data[2]+start, accum2); | |
448 ITERATION(src_data[3]+start, accum3); | |
449 } | |
450 | |
451 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
452 accum0 = _mm_packs_epi32(accum0, zero); | |
453 accum0 = _mm_packus_epi16(accum0, zero); | |
454 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
455 accum1 = _mm_packs_epi32(accum1, zero); | |
456 accum1 = _mm_packus_epi16(accum1, zero); | |
457 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
458 accum2 = _mm_packs_epi32(accum2, zero); | |
459 accum2 = _mm_packus_epi16(accum2, zero); | |
460 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); | |
461 accum3 = _mm_packs_epi32(accum3, zero); | |
462 accum3 = _mm_packus_epi16(accum3, zero); | |
463 | |
464 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | |
465 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | |
466 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | |
467 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | |
468 | |
469 out_row[0] += 4; | |
470 out_row[1] += 4; | |
471 out_row[2] += 4; | |
472 out_row[3] += 4; | |
473 } | |
474 #endif | |
475 } | |
476 | |
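The ITERATION macro above expands, per row, to exactly the load/unpack/multiply/accumulate sequence of the single-row function; keeping four independent accumulators lets the four rows' dependency chains overlap in the pipeline. Purely as a readability sketch (the CL itself keeps the macro), the same step written as an inline helper:

#include <emmintrin.h>

static inline __m128i IterationStep(const unsigned char* src,
                                    __m128i coeff16lo, __m128i coeff16hi,
                                    __m128i zero, __m128i accum) {
  __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
  __m128i src16 = _mm_unpacklo_epi8(src8, zero);   // pixels 0,1 -> 16 bits
  __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16lo);
  __m128i mul_lo = _mm_mullo_epi16(src16, coeff16lo);
  accum = _mm_add_epi32(accum, _mm_unpacklo_epi16(mul_lo, mul_hi));
  accum = _mm_add_epi32(accum, _mm_unpackhi_epi16(mul_lo, mul_hi));
  src16 = _mm_unpackhi_epi8(src8, zero);           // pixels 2,3 -> 16 bits
  mul_hi = _mm_mulhi_epi16(src16, coeff16hi);
  mul_lo = _mm_mullo_epi16(src16, coeff16hi);
  accum = _mm_add_epi32(accum, _mm_unpacklo_epi16(mul_lo, mul_hi));
  return _mm_add_epi32(accum, _mm_unpackhi_epi16(mul_lo, mul_hi));
}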
477 // Does vertical convolution to produce one output row. The filter values and | |
478 // length are given in the first two parameters. These are applied to each | |
479 // of the rows pointed to in the |source_data_rows| array, with each row | |
480 // being |pixel_width| wide. | |
481 // | |
482 // The output must have room for |pixel_width * 4| bytes. | |
483 template<bool has_alpha> | |
484 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values, | |
485 int filter_length, | |
486 unsigned char* const* source_data_rows, | |
487 int pixel_width, | |
488 unsigned char* out_row) { | |
489 #ifdef ARCH_CPU_X86_FAMILY | |
490 int width = pixel_width & ~3; | |
491 | |
492 __m128i zero = _mm_setzero_si128(); | |
493 __m128i accum0, accum1, accum2, accum3, coeff16; | |
494 const __m128i* src; | |
495 // Output four pixels per iteration (16 bytes). | |
496 for (int i = 0; i < width; i += 4) { | |
brettw
2011/02/25 06:43:02
Can you give your loop variables better names than
jiesun
2011/03/07 18:57:15
Done.
| |
497 | |
498 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
499 accum0 = _mm_setzero_si128(); | |
500 accum1 = _mm_setzero_si128(); | |
501 accum2 = _mm_setzero_si128(); | |
502 accum3 = _mm_setzero_si128(); | |
503 | |
504 // Convolve with one filter coefficient per iteration. | |
505 for (int j = 0; j < filter_length; ++j) { | |
506 | |
507 // Duplicate the filter coefficient 8 times. | |
508 // [16] cj cj cj cj cj cj cj cj | |
509 coeff16 = _mm_set1_epi16(filter_values[j]); | |
510 | |
511 // Load four pixels (16 bytes) together. | |
512 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
513 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][i<<2]); | |
514 __m128i src8 = _mm_loadu_si128(src); | |
515 | |
516 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel => | |
517 // multiply with current coefficient => accumulate the result. | |
518 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
519 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
520 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
521 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
522 // [32] a0 b0 g0 r0 | |
523 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
524 accum0 = _mm_add_epi32(accum0, t); | |
525 // [32] a1 b1 g1 r1 | |
526 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
527 accum1 = _mm_add_epi32(accum1, t); | |
528 | |
529 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel => | |
530 // multiply with current coefficient => accumulate the result. | |
531 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
532 src16 = _mm_unpackhi_epi8(src8, zero); | |
533 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
534 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
535 // [32] a2 b2 g2 r2 | |
536 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
537 accum2 = _mm_add_epi32(accum2, t); | |
538 // [32] a3 b3 g3 r3 | |
539 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
540 accum3 = _mm_add_epi32(accum3, t); | |
541 } | |
542 | |
543 // Shift right for fixed point implementation. | |
544 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
545 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
546 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
547 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); | |
548 | |
549 // Pack the 32-bit |accum| into 16 bits per channel (signed saturation). | |
550 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
551 accum0 = _mm_packs_epi32(accum0, accum1); | |
552 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
553 accum2 = _mm_packs_epi32(accum2, accum3); | |
554 | |
555 // Pack the 16-bit |accum| into 8 bits per channel (unsigned saturation). | |
556 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
557 accum0 = _mm_packus_epi16(accum0, accum2); | |
558 | |
559 if (has_alpha) { | |
560 // Compute the max(ri, gi, bi) for each pixel. | |
561 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
562 __m128i a = _mm_srli_epi32(accum0, 8); | |
563 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
564 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
565 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
566 a = _mm_srli_epi32(accum0, 16); | |
567 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
568 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
569 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
570 b = _mm_slli_epi32(b, 24); | |
571 | |
572 // Make sure the value of the alpha channel is at least as large as the | |
573 // maximum value of the color channels. | |
574 accum0 = _mm_max_epu8(b, accum0); | |
575 } else { | |
576 // Set value of alpha channels to 0xFF. | |
577 __m128i mask = _mm_set1_epi32(0xff000000); | |
578 accum0 = _mm_or_si128(accum0, mask); | |
579 } | |
580 | |
581 // Store the convolution result (16 bytes) and advance the pixel pointers. | |
582 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); | |
583 out_row += 16; | |
584 } | |
585 | |
586 // When the width of the output is not divisible by 4, we need to store one | |
587 // pixel (4 bytes) at a time, and the fourth pixel is always absent. | |
588 if (pixel_width & 3) { | |
589 accum0 = _mm_setzero_si128(); | |
590 accum1 = _mm_setzero_si128(); | |
591 accum2 = _mm_setzero_si128(); | |
592 for (int j = 0; j < filter_length; ++j) { | |
593 coeff16 = _mm_set1_epi16(filter_values[j]); | |
594 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
595 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][width<<2]); | |
596 __m128i src8 = _mm_loadu_si128(src); | |
597 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
598 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
599 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
600 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
601 // [32] a0 b0 g0 r0 | |
602 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
603 accum0 = _mm_add_epi32(accum0, t); | |
604 // [32] a1 b1 g1 r1 | |
605 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
606 accum1 = _mm_add_epi32(accum1, t); | |
607 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
608 src16 = _mm_unpackhi_epi8(src8, zero); | |
609 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
610 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
611 // [32] a2 b2 g2 r2 | |
612 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
613 accum2 = _mm_add_epi32(accum2, t); | |
614 } | |
615 | |
616 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
617 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
618 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
619 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
620 accum0 = _mm_packs_epi32(accum0, accum1); | |
621 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
622 accum2 = _mm_packs_epi32(accum2, zero); | |
623 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
624 accum0 = _mm_packus_epi16(accum0, accum2); | |
625 if (has_alpha) { | |
626 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
627 __m128i a = _mm_srli_epi32(accum0, 8); | |
628 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
629 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
630 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
631 a = _mm_srli_epi32(accum0, 16); | |
632 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
633 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
634 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
635 b = _mm_slli_epi32(b, 24); | |
636 accum0 = _mm_max_epu8(b, accum0); | |
637 } else { | |
638 __m128i mask = _mm_set1_epi32(0xff000000); | |
639 accum0 = _mm_or_si128(accum0, mask); | |
640 } | |
641 | |
642 for (int i = width; i < pixel_width; ++i) { | |
643 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); | |
644 accum0 = _mm_srli_si128(accum0, 4); | |
645 out_row += 4; | |
646 } | |
647 } | |
648 #endif | |
649 } | |
650 | |
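The shift-and-max sequence above is the SIMD form of the scalar alpha clamp in the C path: within each 32-bit pixel (bytes r, g, b, a from low to high), two right shifts line g and b up under r, byte-wise max reduces them, and a left shift by 24 moves the maximum into the alpha lane. A scalar model of one pixel, for explanation only:

#include <algorithm>
#include <stdint.h>

// px is packed as (a << 24) | (b << 16) | (g << 8) | r, premultiplied.
uint32_t ClampAlphaScalar(uint32_t px) {
  uint8_t r = static_cast<uint8_t>(px);
  uint8_t g = static_cast<uint8_t>(px >> 8);
  uint8_t b = static_cast<uint8_t>(px >> 16);
  uint8_t a = static_cast<uint8_t>(px >> 24);
  uint8_t m = std::max(r, std::max(g, b));
  if (a < m)
    a = m;  // what the final _mm_max_epu8 achieves for the alpha byte
  return (static_cast<uint32_t>(a) << 24) | (px & 0x00FFFFFFu);
}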
222 } // namespace | 651 } // namespace |
223 | 652 |
224 // ConvolutionFilter1D --------------------------------------------------------- | 653 // ConvolutionFilter1D --------------------------------------------------------- |
225 | 654 |
226 ConvolutionFilter1D::ConvolutionFilter1D() | 655 ConvolutionFilter1D::ConvolutionFilter1D() |
227 : max_filter_(0) { | 656 : max_filter_(0) { |
228 } | 657 } |
229 | 658 |
230 ConvolutionFilter1D::~ConvolutionFilter1D() { | 659 ConvolutionFilter1D::~ConvolutionFilter1D() { |
231 } | 660 } |
(...skipping 45 matching lines...) | |
277 // We pushed filter_length elements onto filter_values_ | 706 // We pushed filter_length elements onto filter_values_ |
278 instance.data_location = (static_cast<int>(filter_values_.size()) - | 707 instance.data_location = (static_cast<int>(filter_values_.size()) - |
279 filter_length); | 708 filter_length); |
280 instance.offset = filter_offset; | 709 instance.offset = filter_offset; |
281 instance.length = filter_length; | 710 instance.length = filter_length; |
282 filters_.push_back(instance); | 711 filters_.push_back(instance); |
283 | 712 |
284 max_filter_ = std::max(max_filter_, filter_length); | 713 max_filter_ = std::max(max_filter_, filter_length); |
285 } | 714 } |
286 | 715 |
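For context on the bookkeeping above: AddFilter appends the coefficients to the flat filter_values_ array and records where they start, so the FilterForValue lookup used throughout this file presumably just dereferences that record. A minimal sketch inferred from the fields visible here (the real accessor lives in convolver.h and may differ):

const ConvolutionFilter1D::Fixed* ConvolutionFilter1D::FilterForValue(
    int value_pos, int* filter_offset, int* filter_length) const {
  const FilterInstance& instance = filters_[value_pos];
  *filter_offset = instance.offset;
  *filter_length = instance.length;
  return &filter_values_[instance.data_location];
}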
287 // BGRAConvolve2D ------------------------------------------------------------- | 716 void BGRAConvolve2D_C(const unsigned char* source_data, |
288 | 717 int source_byte_row_stride, |
289 void BGRAConvolve2D(const unsigned char* source_data, | 718 bool source_has_alpha, |
290 int source_byte_row_stride, | 719 const ConvolutionFilter1D& filter_x, |
291 bool source_has_alpha, | 720 const ConvolutionFilter1D& filter_y, |
292 const ConvolutionFilter1D& filter_x, | 721 int output_byte_row_stride, |
293 const ConvolutionFilter1D& filter_y, | 722 unsigned char* output) { |
294 int output_byte_row_stride, | |
295 unsigned char* output) { | |
296 int max_y_filter_size = filter_y.max_filter(); | 723 int max_y_filter_size = filter_y.max_filter(); |
297 | 724 |
298 // The next row in the input that we will generate a horizontally | 725 // The next row in the input that we will generate a horizontally |
299 // convolved row for. If the filter doesn't start at the beginning of the | 726 // convolved row for. If the filter doesn't start at the beginning of the |
300 // image (this is the case when we are only resizing a subset), then we | 727 // image (this is the case when we are only resizing a subset), then we |
301 // don't want to generate any output rows before that. Compute the starting | 728 // don't want to generate any output rows before that. Compute the starting |
302 // row for convolution as the first pixel for the first vertical filter. | 729 // row for convolution as the first pixel for the first vertical filter. |
303 int filter_offset, filter_length; | 730 int filter_offset, filter_length; |
304 const ConvolutionFilter1D::Fixed* filter_values = | 731 const ConvolutionFilter1D::Fixed* filter_values = |
305 filter_y.FilterForValue(0, &filter_offset, &filter_length); | 732 filter_y.FilterForValue(0, &filter_offset, &filter_length); |
306 int next_x_row = filter_offset; | 733 int next_x_row = filter_offset; |
307 | 734 |
308 // We loop over each row in the input doing a horizontal convolution. This | 735 // We loop over each row in the input doing a horizontal convolution. This |
309 // will result in a horizontally convolved image. We write the results into | 736 // will result in a horizontally convolved image. We write the results into |
310 // a circular buffer of convolved rows and do vertical convolution as rows | 737 // a circular buffer of convolved rows and do vertical convolution as rows |
311 // are available. This prevents us from having to store the entire | 738 // are available. This prevents us from having to store the entire |
312 // intermediate image and helps cache coherency. | 739 // intermediate image and helps cache coherency. |
313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, | 740 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, |
314 filter_offset); | 741 filter_offset); |
315 | 742 |
316 // Loop over every possible output row, processing just enough horizontal | 743 // Loop over every possible output row, processing just enough horizontal |
317 // convolutions to run each subsequent vertical convolution. | 744 // convolutions to run each subsequent vertical convolution. |
318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); | 745 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); |
319 int num_output_rows = filter_y.num_values(); | 746 int num_output_rows = filter_y.num_values(); |
747 | |
320 for (int out_y = 0; out_y < num_output_rows; out_y++) { | 748 for (int out_y = 0; out_y < num_output_rows; out_y++) { |
321 filter_values = filter_y.FilterForValue(out_y, | 749 filter_values = filter_y.FilterForValue(out_y, |
322 &filter_offset, &filter_length); | 750 &filter_offset, &filter_length); |
323 | 751 |
324 // Generate output rows until we have enough to run the current filter. | 752 // Generate output rows until we have enough to run the current filter. |
325 while (next_x_row < filter_offset + filter_length) { | 753 while (next_x_row < filter_offset + filter_length) { |
326 if (source_has_alpha) { | 754 if (source_has_alpha) { |
327 ConvolveHorizontally<true>( | 755 ConvolveHorizontally<true>( |
328 &source_data[next_x_row * source_byte_row_stride], | 756 &source_data[next_x_row * source_byte_row_stride], |
329 filter_x, row_buffer.AdvanceRow()); | 757 filter_x, row_buffer.AdvanceRow()); |
(...skipping 17 matching lines...) | |
347 // needs. | 775 // needs. |
348 unsigned char* const* first_row_for_filter = | 776 unsigned char* const* first_row_for_filter = |
349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; | 777 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; |
350 | 778 |
351 if (source_has_alpha) { | 779 if (source_has_alpha) { |
352 ConvolveVertically<true>(filter_values, filter_length, | 780 ConvolveVertically<true>(filter_values, filter_length, |
353 first_row_for_filter, | 781 first_row_for_filter, |
354 filter_x.num_values(), cur_output_row); | 782 filter_x.num_values(), cur_output_row); |
355 } else { | 783 } else { |
356 ConvolveVertically<false>(filter_values, filter_length, | 784 ConvolveVertically<false>(filter_values, filter_length, |
357 first_row_for_filter, | 785 first_row_for_filter, |
358 filter_x.num_values(), cur_output_row); | 786 filter_x.num_values(), cur_output_row); |
359 } | 787 } |
360 } | 788 } |
361 } | 789 } |
362 | 790 |
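The CircularRowBuffer used by both paths is defined elsewhere (convolver.h). From the call sites in this file, AdvanceRow() hands out the next scratch row to convolve into and GetRowAddresses() enumerates the buffered rows in input order. A sketch of that inferred contract (names taken from the call sites above; details may differ):

void SketchRowBufferUse(int row_width, int max_y_filter_size,
                        int first_input_row) {
  CircularRowBuffer row_buffer(row_width, max_y_filter_size, first_input_row);
  unsigned char* scratch = row_buffer.AdvanceRow();  // next row to fill
  int first_row_in_buffer;
  unsigned char* const* rows =
      row_buffer.GetRowAddresses(&first_row_in_buffer);
  // rows[k] now holds the convolved input row (first_row_in_buffer + k).
}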
791 // BGRAConvolve2D ------------------------------------------------------------- | |
792 | |
793 void BGRAConvolve2D_SSE2(const unsigned char* source_data, | |
794 int source_byte_row_stride, | |
795 bool source_has_alpha, | |
796 const ConvolutionFilter1D& filter_x, | |
797 const ConvolutionFilter1D& filter_y, | |
798 int output_byte_row_stride, | |
799 unsigned char* output) { | |
800 int max_y_filter_size = filter_y.max_filter(); | |
801 | |
802 // The next row in the input that we will generate a horizontally | |
803 // convolved row for. If the filter doesn't start at the beginning of the | |
804 // image (this is the case when we are only resizing a subset), then we | |
805 // don't want to generate any output rows before that. Compute the starting | |
806 // row for convolution as the first pixel for the first vertical filter. | |
807 int filter_offset, filter_length; | |
808 const ConvolutionFilter1D::Fixed* filter_values = | |
809 filter_y.FilterForValue(0, &filter_offset, &filter_length); | |
810 int next_x_row = filter_offset; | |
811 | |
812 // We loop over each row in the input doing a horizontal convolution. This | |
813 // will result in a horizontally convolved image. We write the results into | |
814 // a circular buffer of convolved rows and do vertical convolution as rows | |
815 // are available. This prevents us from having to store the entire | |
816 // intermediate image and helps cache coherency. | |
817 // We need four extra rows so that horizontal convolution can be done on | |
818 // four rows simultaneously. We also pad each row in the row buffer to be | |
819 // aligned up to 16 bytes. | |
820 // TODO(jiesun): We do not use aligned loads from the row buffer in the | |
821 // vertical convolution pass yet. Somehow Windows does not like it. | |
822 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF; | |
823 int row_buffer_height = max_y_filter_size + 4; | |
824 CircularRowBuffer row_buffer(row_buffer_width, | |
825 row_buffer_height, | |
826 filter_offset); | |
827 | |
828 // Loop over every possible output row, processing just enough horizontal | |
829 // convolutions to run each subsequent vertical convolution. | |
830 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); | |
831 int num_output_rows = filter_y.num_values(); | |
832 | |
833 int last_filter_offset, last_filter_length; | |
brettw
2011/02/25 06:43:02
Can you comment why you need these?
jiesun
2011/03/07 18:57:15
Done.
| |
834 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset, | |
835 &last_filter_length); | |
836 | |
837 for (int out_y = 0; out_y < num_output_rows; out_y++) { | |
838 filter_values = filter_y.FilterForValue(out_y, | |
839 &filter_offset, &filter_length); | |
840 | |
841 // Generate output rows until we have enough to run the current filter. | |
842 while (next_x_row < filter_offset + filter_length) { | |
843 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) { | |
844 const unsigned char* src[4]; | |
845 unsigned char* out_row[4]; | |
846 for (int i = 0; i < 4; ++i) { | |
847 src[i] = &source_data[(next_x_row+i) * source_byte_row_stride]; | |
848 out_row[i] = row_buffer.AdvanceRow(); | |
849 } | |
850 ConvolveHorizontally4_SSE2(src, filter_x, out_row); | |
851 next_x_row+=4; | |
852 } else { | |
853 // For the last row, the SSE2 load may access data beyond the image | |
854 // area; therefore we use the C version here. Hacking into Skia to | |
855 // add line padding is not something we want to do. | |
856 if (next_x_row == last_filter_offset + last_filter_length - 1) { | |
857 if (source_has_alpha) | |
brettw
2011/02/25 06:43:02
These multi-line conditionals need {}
jiesun
2011/03/07 18:57:15
Done.
| |
858 ConvolveHorizontally<true>( | |
859 &source_data[next_x_row * source_byte_row_stride], | |
860 filter_x, row_buffer.AdvanceRow()); | |
861 else | |
862 ConvolveHorizontally<false>( | |
863 &source_data[next_x_row * source_byte_row_stride], | |
864 filter_x, row_buffer.AdvanceRow()); | |
865 } else { | |
866 ConvolveHorizontally_SSE2( | |
867 &source_data[next_x_row * source_byte_row_stride], | |
868 filter_x, row_buffer.AdvanceRow()); | |
869 } | |
870 next_x_row++; | |
871 } | |
872 } | |
873 | |
874 // Compute where in the output image this row of final data will go. | |
875 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride]; | |
876 | |
877 // Get the list of rows that the circular buffer has, in order. | |
878 int first_row_in_circular_buffer; | |
879 unsigned char* const* rows_to_convolve = | |
880 row_buffer.GetRowAddresses(&first_row_in_circular_buffer); | |
881 | |
882 // Now compute the start of the subset of those rows that the filter | |
883 // needs. | |
884 unsigned char* const* first_row_for_filter = | |
885 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; | |
886 | |
887 if (source_has_alpha) { | |
888 ConvolveVertically_SSE2<true>(filter_values, filter_length, | |
889 first_row_for_filter, | |
890 filter_x.num_values(), cur_output_row); | |
891 } else { | |
892 ConvolveVertically_SSE2<false>(filter_values, filter_length, | |
893 first_row_for_filter, | |
894 filter_x.num_values(), cur_output_row); | |
895 } | |
896 } | |
897 } | |
898 | |
899 void BGRAConvolve2D(const unsigned char* source_data, | |
900 int source_byte_row_stride, | |
901 bool source_has_alpha, | |
902 const ConvolutionFilter1D& filter_x, | |
903 const ConvolutionFilter1D& filter_y, | |
904 int output_byte_row_stride, | |
905 unsigned char* output) { | |
906 base::CPU cpu; | |
907 if (cpu.has_sse2()) { | |
908 BGRAConvolve2D_SSE2(source_data, source_byte_row_stride, source_has_alpha, | |
909 filter_x, filter_y, output_byte_row_stride, output); | |
910 } else { | |
911 BGRAConvolve2D_C(source_data, source_byte_row_stride, source_has_alpha, | |
912 filter_x, filter_y, output_byte_row_stride, output); | |
913 } | |
914 } | |
915 | |
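Finally, a hypothetical call site for the dispatching BGRAConvolve2D above, to show the parameter conventions. ComputeResizeFilters is a made-up placeholder for whatever populates the ConvolutionFilter1D tables; in Chromium that work is done by skia::ImageOperations:

void SketchResize(const unsigned char* src_bgra, int src_width, int src_height,
                  int dst_width, int dst_height, unsigned char* dst_bgra) {
  skia::ConvolutionFilter1D filter_x, filter_y;
  ComputeResizeFilters(src_width, dst_width, &filter_x);    // hypothetical
  ComputeResizeFilters(src_height, dst_height, &filter_y);  // hypothetical
  skia::BGRAConvolve2D(src_bgra,
                       src_width * 4,         // source stride in bytes
                       /*source_has_alpha=*/true,
                       filter_x, filter_y,
                       dst_width * 4,         // output stride in bytes
                       dst_bgra);
}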
363 } // namespace skia | 916 } // namespace skia |