OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <algorithm> | 5 #include <algorithm> |
6 | 6 |
7 #include "skia/ext/convolver.h" | 7 #include "skia/ext/convolver.h" |
8 #include "third_party/skia/include/core/SkTypes.h" | 8 #include "third_party/skia/include/core/SkTypes.h" |
9 | 9 |
10 #if defined(ARCH_CPU_X86_FAMILY) | |
11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h | |
12 #endif | |
13 | |
10 namespace skia { | 14 namespace skia { |
11 | 15 |
12 namespace { | 16 namespace { |
13 | 17 |
14 // Converts the argument to an 8-bit unsigned value by clamping to the range | 18 // Converts the argument to an 8-bit unsigned value by clamping to the range |
15 // 0-255. | 19 // 0-255. |
16 inline unsigned char ClampTo8(int a) { | 20 inline unsigned char ClampTo8(int a) { |
17 if (static_cast<unsigned>(a) < 256) | 21 if (static_cast<unsigned>(a) < 256) |
18 return a; // Avoid the extra check in the common case. | 22 return a; // Avoid the extra check in the common case. |
19 if (a < 0) | 23 if (a < 0) |
(...skipping 172 matching lines...) | |
192 if (has_alpha) | 196 if (has_alpha) |
193 accum[3] >>= ConvolutionFilter1D::kShiftBits; | 197 accum[3] >>= ConvolutionFilter1D::kShiftBits; |
194 | 198 |
195 // Store the new pixel. | 199 // Store the new pixel. |
196 out_row[byte_offset + 0] = ClampTo8(accum[0]); | 200 out_row[byte_offset + 0] = ClampTo8(accum[0]); |
197 out_row[byte_offset + 1] = ClampTo8(accum[1]); | 201 out_row[byte_offset + 1] = ClampTo8(accum[1]); |
198 out_row[byte_offset + 2] = ClampTo8(accum[2]); | 202 out_row[byte_offset + 2] = ClampTo8(accum[2]); |
199 if (has_alpha) { | 203 if (has_alpha) { |
200 unsigned char alpha = ClampTo8(accum[3]); | 204 unsigned char alpha = ClampTo8(accum[3]); |
201 | 205 |
202 // Make sure the alpha channel doesn't come out larger than any of the | 206 // Make sure the alpha channel doesn't come out smaller than any of the |
203 // color channels. We use premultipled alpha channels, so this should | 207 // color channels. We use premultipled alpha channels, so this should |
204 // never happen, but rounding errors will cause this from time to time. | 208 // never happen, but rounding errors will cause this from time to time. |
205 // These "impossible" colors will cause overflows (and hence random pixel | 209 // These "impossible" colors will cause overflows (and hence random pixel |
206 // values) when the resulting bitmap is drawn to the screen. | 210 // values) when the resulting bitmap is drawn to the screen. |
207 // | 211 // |
208 // We only need to do this when generating the final output row (here). | 212 // We only need to do this when generating the final output row (here). |
209 int max_color_channel = std::max(out_row[byte_offset + 0], | 213 int max_color_channel = std::max(out_row[byte_offset + 0], |
210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); | 214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); |
211 if (alpha < max_color_channel) | 215 if (alpha < max_color_channel) |
212 out_row[byte_offset + 3] = max_color_channel; | 216 out_row[byte_offset + 3] = max_color_channel; |
213 else | 217 else |
214 out_row[byte_offset + 3] = alpha; | 218 out_row[byte_offset + 3] = alpha; |
215 } else { | 219 } else { |
216 // No alpha channel, the image is opaque. | 220 // No alpha channel, the image is opaque. |
217 out_row[byte_offset + 3] = 0xff; | 221 out_row[byte_offset + 3] = 0xff; |
218 } | 222 } |
219 } | 223 } |
220 } | 224 } |
221 | 225 |
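For context on the `has_alpha` branch above: the output is premultiplied BGRA, so every color channel should already be at or below the alpha channel, but the fixed-point convolution can round a color channel slightly above alpha, producing "impossible" colors that overflow when composited. A minimal scalar sketch of the same fix-up (hypothetical helper, not part of this change; it assumes the alpha byte sits at offset 3 of the pixel, as in `out_row[byte_offset + 3]`):

```cpp
#include <algorithm>

// Force alpha >= max(color channels) for one premultiplied pixel.
inline void FixupPremultipliedAlpha(unsigned char* pixel) {
  unsigned char max_color =
      std::max(pixel[0], std::max(pixel[1], pixel[2]));
  if (pixel[3] < max_color)
    pixel[3] = max_color;  // same effect as the branch in the code above
}
```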
226 | |
227 // Convolves horizontally along a single row. The row data is given in | |
228 // |src_data| and continues for the num_values() of the filter. | |
229 void ConvolveHorizontally_SSE2(const unsigned char* src_data, | |
230 const ConvolutionFilter1D& filter, | |
231 unsigned char* out_row) { | |
232 #ifdef ARCH_CPU_X86_FAMILY | |
233 int num_values = filter.num_values(); | |
234 | |
235 int filter_offset, filter_length; | |
236 __m128i zero = _mm_setzero_si128(); | |
237 __m128i mask[4]; | |
brettw
2011/02/21 04:45:45
What's mask[0] for? Can you provide a comment for
jiesun
2011/02/22 21:37:03
yes, mask[0] is not used.
| |
238 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
239 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
240 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
241 | |
242 for (int out_x = 0; out_x < num_values; out_x += 1) { | |
243 const ConvolutionFilter1D::Fixed* filter_values = | |
244 filter.FilterForValue(out_x, &filter_offset, &filter_length); | |
245 | |
246 __m128i accum = _mm_setzero_si128(); | |
247 | |
248 const __m128i* row_to_filter = | |
brettw
2011/02/21 04:45:45
Can you comment what this means?
jiesun
2011/02/22 21:37:03
Done.
| |
249 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); | |
250 // Four filter taps per iteration. | |
251 for (int j = 0; j < filter_length >> 2; ++j) { | |
252 __m128i coeff, coeff16; | |
brettw
2011/02/21 04:45:45
For each of the "blocks" of SSE code you're writte
jiesun
2011/02/22 21:37:03
Done.
| |
253 // [16] xx xx xx xx c3 c2 c1 c0 | |
254 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
255 // [16] xx xx xx xx c1 c1 c0 c0 | |
256 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
257 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
258 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
259 | |
260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
261 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
262 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
263 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
264 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
265 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
266 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | |
267 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
268 accum = _mm_add_epi32(accum, t); | |
269 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | |
270 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
271 accum = _mm_add_epi32(accum, t); | |
272 | |
273 // [16] xx xx xx xx c3 c3 c2 c2 | |
274 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
275 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
276 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
277 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
278 src16 = _mm_unpackhi_epi8(src8, zero); | |
279 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
280 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
281 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | |
282 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
283 accum = _mm_add_epi32(accum, t); | |
284 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | |
285 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
286 accum = _mm_add_epi32(accum, t); | |
287 | |
288 row_to_filter += 1; | |
289 filter_values += 4; | |
290 } | |
291 | |
292 // Handle the remaining 1 to 3 filter taps. | |
brettw
2011/02/21 04:45:45
Can you provide a better comment here?
jiesun
2011/02/22 21:37:03
Done.
| |
293 int r = filter_length&3; | |
294 if (r) { | |
295 // Note: filter_values must be padded to align_up(filter_offset, 8). | |
296 __m128i coeff, coeff16; | |
297 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
298 // Mask out extra filter taps. | |
299 coeff = _mm_and_si128(coeff, mask[r]); | |
300 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
301 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
302 | |
303 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
304 // We resolve this by using the C version for the last horizontal line. | |
305 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
306 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
307 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
308 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
309 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
310 accum = _mm_add_epi32(accum, t); | |
311 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
312 accum = _mm_add_epi32(accum, t); | |
313 | |
314 src16 = _mm_unpackhi_epi8(src8, zero); | |
315 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
316 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
317 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
318 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
319 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
320 accum = _mm_add_epi32(accum, t); | |
321 } | |
322 | |
323 // Shift right to remove the fixed-point fractional bits before saturating. | |
324 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits); | |
325 accum = _mm_packs_epi32(accum, zero); | |
326 accum = _mm_packus_epi16(accum, zero); | |
327 | |
328 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); | |
329 out_row += 4; | |
330 } | |
331 #endif | |
332 } | |
333 | |
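A scalar restatement of what `ConvolveHorizontally_SSE2` computes may help when reading the intrinsics: each output pixel is the sum of `filter_length` source pixels weighted by 16-bit fixed-point coefficients, accumulated in 32 bits, shifted right by `ConvolutionFilter1D::kShiftBits`, and clamped to 0..255. The SSE2 loop does this for all four channels at once and four taps per iteration. Sketch only; the `Fixed` typedef and the value of `kShiftBits` below are assumptions standing in for the definitions in convolver.h:

```cpp
#include <algorithm>

typedef short Fixed;        // assumed: 16-bit fixed-point coefficient type
const int kShiftBits = 14;  // assumed: number of fractional bits

// Convolve one output pixel of a BGRA row with one filter.
void ConvolveOnePixelScalar(const unsigned char* src_row, int filter_offset,
                            int filter_length, const Fixed* coeffs,
                            unsigned char out[4]) {
  int accum[4] = {0, 0, 0, 0};
  for (int tap = 0; tap < filter_length; ++tap) {
    const unsigned char* px = &src_row[(filter_offset + tap) * 4];
    for (int c = 0; c < 4; ++c)
      accum[c] += coeffs[tap] * px[c];  // 8-bit sample * fixed-point weight
  }
  for (int c = 0; c < 4; ++c) {
    int v = accum[c] >> kShiftBits;     // drop the fractional bits
    out[c] = static_cast<unsigned char>(std::min(255, std::max(0, v)));
  }
}
```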
334 // Convolves horizontally along four rows. The row data is given in | |
335 // |src_data| and continues for the num_values() of the filter. | |
336 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4], | |
337 const ConvolutionFilter1D& filter, | |
338 unsigned char* out_row[4]) { | |
339 #ifdef ARCH_CPU_X86_FAMILY | |
340 int width = filter.num_values(); | |
341 | |
342 int filter_offset, filter_length; | |
343 __m128i zero = _mm_setzero_si128(); | |
344 __m128i mask[4]; | |
345 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
346 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
347 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
348 | |
349 for (int i = 0; i < width; ++i) { | |
350 const ConvolutionFilter1D::Fixed* filter_values = | |
351 filter.FilterForValue(i, &filter_offset, &filter_length); | |
352 | |
353 // Compute a column of four output pixels, one from each row, per iteration. | |
354 __m128i accum0 = _mm_setzero_si128(); | |
355 __m128i accum1 = _mm_setzero_si128(); | |
356 __m128i accum2 = _mm_setzero_si128(); | |
357 __m128i accum3 = _mm_setzero_si128(); | |
358 int start = (filter_offset<<2); | |
359 for (int j = 0; j < (filter_length >> 2); ++j) { | |
360 __m128i coeff, coeff16lo, coeff16hi; | |
361 // [16] xx xx xx xx c3 c2 c1 c0 | |
362 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
363 // [16] xx xx xx xx c1 c1 c0 c0 | |
364 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
365 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
366 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
367 // [16] xx xx xx xx c3 c3 c2 c2 | |
368 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
369 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
370 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
371 | |
372 __m128i src8, src16, mul_hi, mul_lo, t; | |
373 | |
374 #define ITERATION(src, accum) \ | |
375 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | |
376 src16 = _mm_unpacklo_epi8(src8, zero); \ | |
377 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | |
378 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | |
379 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
380 accum = _mm_add_epi32(accum, t); \ | |
381 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
382 accum = _mm_add_epi32(accum, t); \ | |
383 src16 = _mm_unpackhi_epi8(src8, zero); \ | |
384 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | |
385 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | |
386 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
387 accum = _mm_add_epi32(accum, t); \ | |
388 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
389 accum = _mm_add_epi32(accum, t) | |
390 | |
391 ITERATION(src_data[0]+start, accum0); | |
392 ITERATION(src_data[1]+start, accum1); | |
393 ITERATION(src_data[2]+start, accum2); | |
394 ITERATION(src_data[3]+start, accum3); | |
395 | |
396 start += 16; | |
397 filter_values += 4; | |
398 } | |
399 | |
400 int r = filter_length&3; | |
401 if (r) { | |
402 // Note: filter_values must be padded to align_up(filter_offset, 8); | |
403 __m128i coeff; | |
404 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
405 // Mask out extra filter taps. | |
406 coeff = _mm_and_si128(coeff, mask[r]); | |
407 | |
408 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
409 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
410 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
411 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
412 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
413 | |
414 __m128i src8, src16, mul_hi, mul_lo, t; | |
415 | |
416 ITERATION(src_data[0]+start, accum0); | |
417 ITERATION(src_data[1]+start, accum1); | |
418 ITERATION(src_data[2]+start, accum2); | |
419 ITERATION(src_data[3]+start, accum3); | |
420 } | |
421 | |
422 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
423 accum0 = _mm_packs_epi32(accum0, zero); | |
424 accum0 = _mm_packus_epi16(accum0, zero); | |
425 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
426 accum1 = _mm_packs_epi32(accum1, zero); | |
427 accum1 = _mm_packus_epi16(accum1, zero); | |
428 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
429 accum2 = _mm_packs_epi32(accum2, zero); | |
430 accum2 = _mm_packus_epi16(accum2, zero); | |
431 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); | |
432 accum3 = _mm_packs_epi32(accum3, zero); | |
433 accum3 = _mm_packus_epi16(accum3, zero); | |
434 | |
435 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | |
436 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | |
437 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | |
438 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | |
439 | |
440 out_row[0] += 4; | |
441 out_row[1] += 4; | |
442 out_row[2] += 4; | |
443 out_row[3] += 4; | |
444 } | |
445 #endif | |
446 } | |
447 | |
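`ConvolveHorizontally4_SSE2` uses the same per-pixel math as the single-row version but convolves the same x position on four consecutive rows, so each coefficient load and shuffle is amortized over four outputs. The `mask[r]` table exists because the coefficient load always fetches four 16-bit taps; when only 1 to 3 taps remain, the extra ones must contribute nothing. A scalar sketch of that masking step (hypothetical helper, illustrative only):

```cpp
// Zero the coefficients past the last real tap, mirroring
// _mm_and_si128(coeff, mask[remaining]) in the code above.
void MaskTailTaps(short coeff4[4], int remaining /* 1..3 */) {
  for (int i = remaining; i < 4; ++i)
    coeff4[i] = 0;
}
```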
448 template<bool has_alpha> | |
449 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values, | |
450 int filter_length, | |
brettw
2011/02/21 04:45:45
Check indentation
jiesun
2011/02/22 21:37:03
Done.
| |
451 unsigned char* const* source_data_rows, | |
452 int pixel_width, | |
453 unsigned char* out_row) { | |
454 #ifdef ARCH_CPU_X86_FAMILY | |
455 int width = pixel_width & ~3; | |
456 | |
457 __m128i zero = _mm_setzero_si128(); | |
458 __m128i accum0, accum1, accum2, accum3, coeff16; | |
459 const __m128i* src; | |
460 for (int i = 0; i < width; i += 4) { // Four pixels per iteration. | |
461 accum0 = _mm_setzero_si128(); | |
462 accum1 = _mm_setzero_si128(); | |
463 accum2 = _mm_setzero_si128(); | |
464 accum3 = _mm_setzero_si128(); | |
465 for (int j = 0; j < filter_length; ++j) { | |
466 coeff16 = _mm_set1_epi16(filter_values[j]); | |
467 | |
468 // row_buffer is 16-byte aligned, so an aligned load could be used here. | |
469 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
470 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][i<<2]); | |
471 __m128i src8 = _mm_loadu_si128(src); | |
472 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
473 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
474 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
475 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
476 // [32] a0 b0 g0 r0 | |
477 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
478 accum0 = _mm_add_epi32(accum0, t); | |
479 // [32] a1 b1 g1 r1 | |
480 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
481 accum1 = _mm_add_epi32(accum1, t); | |
482 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
483 src16 = _mm_unpackhi_epi8(src8, zero); | |
484 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
485 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
486 // [32] a2 b2 g2 r2 | |
487 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
488 accum2 = _mm_add_epi32(accum2, t); | |
489 // [32] a3 b3 g3 r3 | |
490 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
491 accum3 = _mm_add_epi32(accum3, t); | |
492 } | |
493 | |
494 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
495 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
496 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
497 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); | |
498 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
499 accum0 = _mm_packs_epi32(accum0, accum1); | |
500 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
501 accum2 = _mm_packs_epi32(accum2, accum3); | |
502 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
503 accum0 = _mm_packus_epi16(accum0, accum2); | |
504 if (has_alpha) { | |
505 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
506 __m128i a = _mm_srli_epi32(accum0, 8); | |
507 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
508 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
509 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
510 a = _mm_srli_epi32(accum0, 16); | |
511 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
512 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
513 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
514 b = _mm_slli_epi32(b, 24); | |
515 accum0 = _mm_max_epu8(b, accum0); | |
516 } else { | |
517 __m128i mask = _mm_set1_epi32(0xff000000); | |
518 accum0 = _mm_or_si128(accum0, mask); | |
519 } | |
520 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); | |
521 out_row += 16; | |
522 } | |
523 | |
524 if (pixel_width & 3) { | |
525 accum0 = _mm_setzero_si128(); | |
526 accum1 = _mm_setzero_si128(); | |
527 accum2 = _mm_setzero_si128(); | |
528 for (int j = 0; j < filter_length; ++j) { | |
529 coeff16 = _mm_set1_epi16(filter_values[j]); | |
530 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
531 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][width<<2]); | |
532 __m128i src8 = _mm_loadu_si128(src); | |
533 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
534 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
535 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
536 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
537 // [32] a0 b0 g0 r0 | |
538 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
539 accum0 = _mm_add_epi32(accum0, t); | |
540 // [32] a1 b1 g1 r1 | |
541 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
542 accum1 = _mm_add_epi32(accum1, t); | |
543 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
544 src16 = _mm_unpackhi_epi8(src8, zero); | |
545 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
546 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
547 // [32] a2 b2 g2 r2 | |
548 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
549 accum2 = _mm_add_epi32(accum2, t); | |
550 } | |
551 | |
552 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
553 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
554 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
555 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
556 accum0 = _mm_packs_epi32(accum0, accum1); | |
557 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
558 accum2 = _mm_packs_epi32(accum2, zero); | |
559 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
560 accum0 = _mm_packus_epi16(accum0, accum2); | |
561 if (has_alpha) { | |
562 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
563 __m128i a = _mm_srli_epi32(accum0, 8); | |
564 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
565 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
566 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
567 a = _mm_srli_epi32(accum0, 16); | |
568 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
569 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
570 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
571 b = _mm_slli_epi32(b, 24); | |
572 accum0 = _mm_max_epu8(b, accum0); | |
573 } else { | |
574 __m128i mask = _mm_set1_epi32(0xff000000); | |
575 accum0 = _mm_or_si128(accum0, mask); | |
576 } | |
577 | |
578 for (int i = width; i < pixel_width; ++i) { | |
579 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); | |
580 accum0 = _mm_srli_si128(accum0, 4); | |
581 out_row += 4; | |
582 } | |
583 } | |
584 #endif | |
585 } | |
586 | |
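In the vertical pass the premultiplied-alpha fix-up is done in SIMD: after packing back to bytes, two per-lane right shifts plus byte-wise maxes leave max(r, g, b) in the low byte of each pixel, a left shift by 24 moves it into the alpha lane, and a final byte-wise max makes alpha at least that value. A single-pixel scalar model of the same bit manipulation (hypothetical sketch; the alpha byte is assumed to be the high byte of the packed 32-bit pixel):

```cpp
#include <cstdint>

// Per-byte unsigned max of two packed 32-bit values, like _mm_max_epu8.
static uint32_t ByteWiseMax(uint32_t x, uint32_t y) {
  uint32_t out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    uint32_t xb = (x >> shift) & 0xff;
    uint32_t yb = (y >> shift) & 0xff;
    out |= (xb > yb ? xb : yb) << shift;
  }
  return out;
}

uint32_t ClampAlphaToMaxColor(uint32_t pixel) {
  uint32_t m = ByteWiseMax(pixel >> 8, pixel);  // low byte: max of two channels
  m = ByteWiseMax(pixel >> 16, m);              // low byte: max of r, g, b
  m <<= 24;                                     // move the max into the alpha lane
  return ByteWiseMax(m, pixel);                 // alpha = max(alpha, r, g, b)
}
```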
222 } // namespace | 587 } // namespace |
223 | 588 |
224 // ConvolutionFilter1D --------------------------------------------------------- | 589 // ConvolutionFilter1D --------------------------------------------------------- |
225 | 590 |
226 ConvolutionFilter1D::ConvolutionFilter1D() | 591 ConvolutionFilter1D::ConvolutionFilter1D() |
227 : max_filter_(0) { | 592 : max_filter_(0) { |
228 } | 593 } |
229 | 594 |
230 ConvolutionFilter1D::~ConvolutionFilter1D() { | 595 ConvolutionFilter1D::~ConvolutionFilter1D() { |
231 } | 596 } |
(...skipping 45 matching lines...) | |
277 // We pushed filter_length elements onto filter_values_ | 642 // We pushed filter_length elements onto filter_values_ |
278 instance.data_location = (static_cast<int>(filter_values_.size()) - | 643 instance.data_location = (static_cast<int>(filter_values_.size()) - |
279 filter_length); | 644 filter_length); |
280 instance.offset = filter_offset; | 645 instance.offset = filter_offset; |
281 instance.length = filter_length; | 646 instance.length = filter_length; |
282 filters_.push_back(instance); | 647 filters_.push_back(instance); |
283 | 648 |
284 max_filter_ = std::max(max_filter_, filter_length); | 649 max_filter_ = std::max(max_filter_, filter_length); |
285 } | 650 } |
286 | 651 |
287 // BGRAConvolve2D ------------------------------------------------------------- | 652 void BGRAConvolve2D_C(const unsigned char* source_data, |
288 | 653 int source_byte_row_stride, |
289 void BGRAConvolve2D(const unsigned char* source_data, | 654 bool source_has_alpha, |
290 int source_byte_row_stride, | 655 const ConvolutionFilter1D& filter_x, |
291 bool source_has_alpha, | 656 const ConvolutionFilter1D& filter_y, |
292 const ConvolutionFilter1D& filter_x, | 657 int output_byte_row_stride, |
293 const ConvolutionFilter1D& filter_y, | 658 unsigned char* output) { |
294 int output_byte_row_stride, | |
295 unsigned char* output) { | |
296 int max_y_filter_size = filter_y.max_filter(); | 659 int max_y_filter_size = filter_y.max_filter(); |
297 | 660 |
298 // The next row in the input that we will generate a horizontally | 661 // The next row in the input that we will generate a horizontally |
299 // convolved row for. If the filter doesn't start at the beginning of the | 662 // convolved row for. If the filter doesn't start at the beginning of the |
300 // image (this is the case when we are only resizing a subset), then we | 663 // image (this is the case when we are only resizing a subset), then we |
301 // don't want to generate any output rows before that. Compute the starting | 664 // don't want to generate any output rows before that. Compute the starting |
302 // row for convolution as the first pixel for the first vertical filter. | 665 // row for convolution as the first pixel for the first vertical filter. |
303 int filter_offset, filter_length; | 666 int filter_offset, filter_length; |
304 const ConvolutionFilter1D::Fixed* filter_values = | 667 const ConvolutionFilter1D::Fixed* filter_values = |
305 filter_y.FilterForValue(0, &filter_offset, &filter_length); | 668 filter_y.FilterForValue(0, &filter_offset, &filter_length); |
306 int next_x_row = filter_offset; | 669 int next_x_row = filter_offset; |
307 | 670 |
308 // We loop over each row in the input doing a horizontal convolution. This | 671 // We loop over each row in the input doing a horizontal convolution. This |
309 // will result in a horizontally convolved image. We write the results into | 672 // will result in a horizontally convolved image. We write the results into |
310 // a circular buffer of convolved rows and do vertical convolution as rows | 673 // a circular buffer of convolved rows and do vertical convolution as rows |
311 // are available. This prevents us from having to store the entire | 674 // are available. This prevents us from having to store the entire |
312 // intermediate image and helps cache coherency. | 675 // intermediate image and helps cache coherency. |
313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, | 676 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, |
314 filter_offset); | 677 filter_offset); |
315 | 678 |
316 // Loop over every possible output row, processing just enough horizontal | 679 // Loop over every possible output row, processing just enough horizontal |
317 // convolutions to run each subsequent vertical convolution. | 680 // convolutions to run each subsequent vertical convolution. |
318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); | 681 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); |
319 int num_output_rows = filter_y.num_values(); | 682 int num_output_rows = filter_y.num_values(); |
683 | |
684 int last_filter_offset, last_filter_length; | |
685 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset, | |
686 &last_filter_length); | |
687 | |
320 for (int out_y = 0; out_y < num_output_rows; out_y++) { | 688 for (int out_y = 0; out_y < num_output_rows; out_y++) { |
321 filter_values = filter_y.FilterForValue(out_y, | 689 filter_values = filter_y.FilterForValue(out_y, |
322 &filter_offset, &filter_length); | 690 &filter_offset, &filter_length); |
323 | 691 |
324 // Generate output rows until we have enough to run the current filter. | 692 // Generate output rows until we have enough to run the current filter. |
325 while (next_x_row < filter_offset + filter_length) { | 693 while (next_x_row < filter_offset + filter_length) { |
326 if (source_has_alpha) { | 694 if (source_has_alpha) { |
327 ConvolveHorizontally<true>( | 695 ConvolveHorizontally<true>( |
328 &source_data[next_x_row * source_byte_row_stride], | 696 &source_data[next_x_row * source_byte_row_stride], |
329 filter_x, row_buffer.AdvanceRow()); | 697 filter_x, row_buffer.AdvanceRow()); |
(...skipping 17 matching lines...) | |
347 // needs. | 715 // needs. |
348 unsigned char* const* first_row_for_filter = | 716 unsigned char* const* first_row_for_filter = |
349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; | 717 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; |
350 | 718 |
351 if (source_has_alpha) { | 719 if (source_has_alpha) { |
352 ConvolveVertically<true>(filter_values, filter_length, | 720 ConvolveVertically<true>(filter_values, filter_length, |
353 first_row_for_filter, | 721 first_row_for_filter, |
354 filter_x.num_values(), cur_output_row); | 722 filter_x.num_values(), cur_output_row); |
355 } else { | 723 } else { |
356 ConvolveVertically<false>(filter_values, filter_length, | 724 ConvolveVertically<false>(filter_values, filter_length, |
357 first_row_for_filter, | 725 first_row_for_filter, |
358 filter_x.num_values(), cur_output_row); | 726 filter_x.num_values(), cur_output_row); |
359 } | 727 } |
360 } | 728 } |
361 } | 729 } |
362 | 730 |
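The driver keeps only as many horizontally convolved rows as the tallest vertical filter needs, cycling through a fixed block of storage instead of materializing the whole intermediate image. A minimal sketch of that idea (hypothetical simplification; the real `CircularRowBuffer` additionally remembers which absolute image rows it holds so `GetRowAddresses()` can hand them back in order):

```cpp
#include <vector>

class RowRing {
 public:
  RowRing(int row_bytes, int window)
      : row_bytes_(row_bytes), window_(window), next_(0),
        storage_(row_bytes * window) {}

  // Returns storage for the next row to fill, recycling the oldest row.
  unsigned char* AdvanceRow() {
    unsigned char* row = &storage_[next_ * row_bytes_];
    next_ = (next_ + 1) % window_;
    return row;
  }

 private:
  int row_bytes_;
  int window_;
  int next_;
  std::vector<unsigned char> storage_;
};
```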
731 // BGRAConvolve2D ------------------------------------------------------------- | |
732 | |
733 void BGRAConvolve2D_SSE2(const unsigned char* source_data, | |
734 int source_byte_row_stride, | |
735 bool source_has_alpha, | |
736 const ConvolutionFilter1D& filter_x, | |
737 const ConvolutionFilter1D& filter_y, | |
738 int output_byte_row_stride, | |
739 unsigned char* output) { | |
740 int max_y_filter_size = filter_y.max_filter(); | |
741 | |
742 // The next row in the input that we will generate a horizontally | |
743 // convolved row for. If the filter doesn't start at the beginning of the | |
744 // image (this is the case when we are only resizing a subset), then we | |
745 // don't want to generate any output rows before that. Compute the starting | |
746 // row for convolution as the first pixel for the first vertical filter. | |
747 int filter_offset, filter_length; | |
748 const ConvolutionFilter1D::Fixed* filter_values = | |
749 filter_y.FilterForValue(0, &filter_offset, &filter_length); | |
750 int next_x_row = filter_offset; | |
751 | |
752 // We loop over each row in the input doing a horizontal convolution. This | |
753 // will result in a horizontally convolved image. We write the results into | |
754 // a circular buffer of convolved rows and do vertical convolution as rows | |
755 // are available. This prevents us from having to store the entire | |
756 // intermediate image and helps cache coherency. | |
757 // We will need four extra rows so that the horizontal convolution can be | |
758 // done on four rows simultaneously. | |
759 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF; | |
760 int row_buffer_height = max_y_filter_size + 4; | |
761 CircularRowBuffer row_buffer(row_buffer_width, | |
762 row_buffer_height, | |
763 filter_offset); | |
764 | |
765 // Loop over every possible output row, processing just enough horizontal | |
766 // convolutions to run each subsequent vertical convolution. | |
767 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); | |
768 int num_output_rows = filter_y.num_values(); | |
769 | |
770 int last_filter_offset, last_filter_length; | |
771 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset, | |
772 &last_filter_length); | |
773 | |
774 for (int out_y = 0; out_y < num_output_rows; out_y++) { | |
775 filter_values = filter_y.FilterForValue(out_y, | |
776 &filter_offset, &filter_length); | |
777 | |
778 // Generate output rows until we have enough to run the current filter. | |
779 while (next_x_row < filter_offset + filter_length) { | |
780 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) { | |
781 const unsigned char* src[4]; | |
782 unsigned char* out_row[4]; | |
783 for (int i = 0; i < 4; ++i) { | |
784 src[i] = &source_data[(next_x_row+i) * source_byte_row_stride]; | |
785 out_row[i] = row_buffer.AdvanceRow(); | |
786 } | |
787 ConvolveHorizontally4_SSE2(src, filter_x, out_row); | |
788 next_x_row+=4; | |
789 } else { | |
790 // For the last row, the SSE2 loads may read data beyond the image | |
791 // area, so we use the C version here instead. Hacking line padding | |
792 // into Skia is not something we want to do. | |
793 if (next_x_row == last_filter_offset + last_filter_length - 1) { | |
794 if (source_has_alpha) | |
795 ConvolveHorizontally<true>( | |
796 &source_data[next_x_row * source_byte_row_stride], | |
797 filter_x, row_buffer.AdvanceRow()); | |
798 else | |
799 ConvolveHorizontally<false>( | |
800 &source_data[next_x_row * source_byte_row_stride], | |
801 filter_x, row_buffer.AdvanceRow()); | |
802 } else { | |
803 ConvolveHorizontally_SSE2( | |
804 &source_data[next_x_row * source_byte_row_stride], | |
805 filter_x, row_buffer.AdvanceRow()); | |
806 } | |
807 next_x_row++; | |
808 } | |
809 } | |
810 | |
811 // Compute where in the output image this row of final data will go. | |
812 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride]; | |
813 | |
814 // Get the list of rows that the circular buffer has, in order. | |
815 int first_row_in_circular_buffer; | |
816 unsigned char* const* rows_to_convolve = | |
817 row_buffer.GetRowAddresses(&first_row_in_circular_buffer); | |
818 | |
819 // Now compute the start of the subset of those rows that the filter | |
820 // needs. | |
821 unsigned char* const* first_row_for_filter = | |
822 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; | |
823 | |
824 if (source_has_alpha) { | |
825 ConvolveVertically_SSE2<true>(filter_values, filter_length, | |
826 first_row_for_filter, | |
827 filter_x.num_values(), cur_output_row); | |
828 } else { | |
829 ConvolveVertically_SSE2<false>(filter_values, filter_length, | |
830 first_row_for_filter, | |
831 filter_x.num_values(), cur_output_row); | |
832 } | |
833 } | |
834 } | |
835 | |
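The SSE2 driver above sizes its row buffer as `(filter_x.num_values() + 15) & ~0xF` pixels wide plus four extra rows, presumably so that 16-byte loads and the four-rows-at-a-time horizontal pass never touch memory outside the buffer. A quick check of the align-up expression (`AlignUp16` is a hypothetical helper for illustration):

```cpp
#include <cassert>

int AlignUp16(int n) { return (n + 15) & ~0xF; }  // round up to a multiple of 16

void AlignUp16Examples() {
  assert(AlignUp16(1) == 16);
  assert(AlignUp16(16) == 16);
  assert(AlignUp16(17) == 32);
}
```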
836 void BGRAConvolve2D(const unsigned char* source_data, | |
837 int source_byte_row_stride, | |
838 bool source_has_alpha, | |
839 const ConvolutionFilter1D& filter_x, | |
840 const ConvolutionFilter1D& filter_y, | |
841 int output_byte_row_stride, | |
842 unsigned char* output) { | |
843 base::CPU cpu; | |
844 if (cpu.has_sse2()) { | |
845 BGRAConvolve2D_SSE2(source_data, source_byte_row_stride, source_has_alpha, | |
846 filter_x, filter_y, output_byte_row_stride, output); | |
847 } else { | |
848 BGRAConvolve2D_C(source_data, source_byte_row_stride, source_has_alpha, | |
849 filter_x, filter_y, output_byte_row_stride, output); | |
850 } | |
851 } | |
852 | |
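The new `BGRAConvolve2D` wrapper constructs a `base::CPU` and queries `has_sse2()` on every call. If that ever matters for performance, one option is to resolve the choice once through a function pointer. Sketch only, under the assumptions that `base::CPU::has_sse2()` comes from base/cpu.h (as used above) and that a cached dispatch would be acceptable under Chromium's static-initializer rules:

```cpp
#include "base/cpu.h"

typedef void (*Convolve2DProc)(const unsigned char* source_data,
                               int source_byte_row_stride,
                               bool source_has_alpha,
                               const ConvolutionFilter1D& filter_x,
                               const ConvolutionFilter1D& filter_y,
                               int output_byte_row_stride,
                               unsigned char* output);

Convolve2DProc ChooseConvolve2DProc() {
  base::CPU cpu;
  return cpu.has_sse2() ? &BGRAConvolve2D_SSE2 : &BGRAConvolve2D_C;
}

// Usage (resolve once, reuse afterwards):
//   static const Convolve2DProc kProc = ChooseConvolve2DProc();
//   kProc(source_data, source_byte_row_stride, source_has_alpha,
//         filter_x, filter_y, output_byte_row_stride, output);
```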
363 } // namespace skia | 853 } // namespace skia |