skia/ext/convolver.cc - Issue 6334070: SIMD implementation of Convolver for Lanczos filter etc.

Side by Side Diff: skia/ext/convolver.cc

Issue 6334070: SIMD implementation of Convolver for Lanczos filter etc. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: try to fix win Created 9 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include <algorithm>	5 #include <algorithm>

6	6

7 #include "skia/ext/convolver.h"	7 #include "skia/ext/convolver.h"

8 #include "third_party/skia/include/core/SkTypes.h"	8 #include "third_party/skia/include/core/SkTypes.h"

9	9

	10 #ifdef ARCH_CPU_X86_FAMILY
	evannier 2011/02/14 23:45:13: I believe Chrome coding style recommends: #if defined(XXX). See also below, on why I think the ARCH_CPU_X86 might be the wrong define to use. jiesun 2011/02/17 20:17:58: Done.
	11 #include <emmintrin.h>

	12 #endif

	13

10 namespace skia {	14 namespace skia {

11	15

12 namespace {	16 namespace {

13	17

14 // Converts the argument to an 8-bit unsigned value by clamping to the range	18 // Converts the argument to an 8-bit unsigned value by clamping to the range

15 // 0-255.	19 // 0-255.

16 inline unsigned char ClampTo8(int a) {	20 inline unsigned char ClampTo8(int a) {

17 if (static_cast<unsigned>(a) < 256)	21 if (static_cast<unsigned>(a) < 256)

18 return a; // Avoid the extra check in the common case.	22 return a; // Avoid the extra check in the common case.

19 if (a < 0)	23 if (a < 0)

(...skipping 172 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
192 if (has_alpha)	196 if (has_alpha)

193 accum[3] >>= ConvolutionFilter1D::kShiftBits;	197 accum[3] >>= ConvolutionFilter1D::kShiftBits;

194	198

195 // Store the new pixel.	199 // Store the new pixel.

196 out_row[byte_offset + 0] = ClampTo8(accum[0]);	200 out_row[byte_offset + 0] = ClampTo8(accum[0]);

197 out_row[byte_offset + 1] = ClampTo8(accum[1]);	201 out_row[byte_offset + 1] = ClampTo8(accum[1]);

198 out_row[byte_offset + 2] = ClampTo8(accum[2]);	202 out_row[byte_offset + 2] = ClampTo8(accum[2]);

199 if (has_alpha) {	203 if (has_alpha) {

200 unsigned char alpha = ClampTo8(accum[3]);	204 unsigned char alpha = ClampTo8(accum[3]);

201	205

202 // Make sure the alpha channel doesn't come out larger than any of the	206 // Make sure the alpha channel doesn't come out smaller than any of the

203 // color channels. We use premultipled alpha channels, so this should	207 // color channels. We use premultipled alpha channels, so this should

204 // never happen, but rounding errors will cause this from time to time.	208 // never happen, but rounding errors will cause this from time to time.

205 // These "impossible" colors will cause overflows (and hence random pixel	209 // These "impossible" colors will cause overflows (and hence random pixel

206 // values) when the resulting bitmap is drawn to the screen.	210 // values) when the resulting bitmap is drawn to the screen.

207 //	211 //

208 // We only need to do this when generating the final output row (here).	212 // We only need to do this when generating the final output row (here).

209 int max_color_channel = std::max(out_row[byte_offset + 0],	213 int max_color_channel = std::max(out_row[byte_offset + 0],

210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));	214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));

211 if (alpha < max_color_channel)	215 if (alpha < max_color_channel)

212 out_row[byte_offset + 3] = max_color_channel;	216 out_row[byte_offset + 3] = max_color_channel;

213 else	217 else

214 out_row[byte_offset + 3] = alpha;	218 out_row[byte_offset + 3] = alpha;

215 } else {	219 } else {

216 // No alpha channel, the image is opaque.	220 // No alpha channel, the image is opaque.

217 out_row[byte_offset + 3] = 0xff;	221 out_row[byte_offset + 3] = 0xff;

218 }	222 }

219 }	223 }

220 }	224 }

221	225

	226

	227 // Convolves horizontally along a single row. The row data is given in

	228 // \|src_data\| and continues for the num_values() of the filter.

	229 template<bool has_alpha>
	evannier 2011/02/14 23:45:13: has_alpha is not used anywhere, suggesting there is a bug here. jiesun 2011/02/15 00:19:22: Actually, the vertical pass always runs after the horizontal pass, therefore the horizontal pass never has to worry about alpha. But I think you are right on this. jiesun 2011/02/17 20:17:58: Done.
	230 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
	evannier 2011/02/14 23:45:13: Maybe move that closer to the other Horizontally for easier comparison (or not). jiesun 2011/02/17 20:17:58: I am grouping all SIMD functions into one #ifdef. Not sure which one is better.
	231 const ConvolutionFilter1D& filter,
	evannier 2011/02/14 23:45:13: Here and elsewhere, indentation looks odd. And various lint errors. jiesun 2011/02/17 20:17:58: Done.
	232 unsigned char* out_row) {

	233 #ifdef ARCH_CPU_X86_FAMILY

	234 int width = filter.num_values();
	evannier 2011/02/14 23:45:13: Maybe name this num_values for better symmetry with the non SIMD version. jiesun 2011/02/17 20:17:58: Done.
	235

	236 int filter_offset, filter_length;

	237 __m128i zero = _mm_setzero_si128();

	238 for (int i = 0; i < width; i += 1) { // One pixel per iteration.
	evannier 2011/02/14 23:45:13: Same comment about the naming. jiesun 2011/02/17 20:17:58: Done.
	239 const ConvolutionFilter1D::Fixed* filter_values =

	240 filter.FilterForValue(i, &filter_offset, &filter_length);

	241

	242 __m128i accum = _mm_setzero_si128();

	243 const unsigned char* start = src_data+(filter_offset<<2);
	evannier 2011/02/14 23:45:13: What is wrong with the way it was written in the other code: const unsigned char* row_to_filter = &src_data[filter_offset * 4]; evannier 2011/02/14 23:45:13: Since this is going to be used as a __m128i, I would do, const __m128i* start = reinterpret_cast<const ...>(etc); This avoids the casts below, and makes the start += 16, a start +=1, which might be clearer ... To the compiler. jiesun 2011/02/17 20:17:58 (replying to the first comment): Done.
	244 // Four filter taps per iteration.

	245 for (int j = 0; j < filter_length >> 2; ++j) {

	246 // [16] xx xx xx xx c3 c2 c1 c0

	247 __m128i coeff = _mm_loadl_epi64((__m128i*)filter_values);
	evannier 2011/02/14 23:45:13: Maybe add a comment explaining that you have ensured that this address is 64 bit aligned (well, I assume you did). jiesun 2011/02/15 00:19:22: I think there is no alignment requirement for _mm_loadl_epi64. I did not align filter values for each filter; I just pad at the end of all filters to make sure the 64 bit load always accesses valid addresses.
	248 // [16] xx xx xx xx c1 c1 c0 c0

	249 __m128i coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
	evannier 2011/02/14 23:45:13: I have not looked at the resulting assembly, but I have been fairly disappointed in GCC code generation when it comes to the use of intrinsics, and it looks like it might be possible that GCC might have to perform register spill over to the stack. Check what it does, and maybe consider doing the other coeff16 shuffle and unpack right there (it is the same number of registers, but it might help GCC, or not).
	250 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	251 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	252

	253 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	254 __m128i src8 = _mm_loadu_si128((__m128i*)start);

	255 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	256 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	257 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
	evannier 2011/02/14 23:45:13: Because of the many stalls these operations might generate, I would interleave both parts in one single pass. Not sure if this would require more registers. If you have already considered that and timed it, please comment on your choice.
	258 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	259 // [32] a0c0 b0c0 g0c0 r0c0

	260 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	261 accum = _mm_add_epi32(accum, t);

	262 // [32] a1c1 b1c1 g1c1 r1c1

	263 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	264 accum = _mm_add_epi32(accum, t);

	265

	266 // [16] xx xx xx xx c3 c3 c2 c2

	267 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	268 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	269 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	270 // [16] a3 g3 b3 r3 a2 g2 b2 r2

	271 src16 = _mm_unpackhi_epi8(src8, zero);

	272 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	273 mul_lo = _mm_mullo_epi16(src16, coeff16);

	274 // [32] a2c2 b2c2 g2c2 r2c2

	275 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	276 accum = _mm_add_epi32(accum, t);

	277 // [32] a3c3 b3c3 g3c3 r3c3

	278 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	279 accum = _mm_add_epi32(accum, t);

	280

	281 start += 16;

	282 filter_values += 4;

	283 }

	284

	285 // remaining

	286 int r = filter_length&3;

	287 if (r) {

	288 // Note: filter_values must be pad to align_up(filter_offset, 8);
	evannier 2011/02/14 23:45:13: padded
	289 __m128i coeff = _mm_loadl_epi64((__m128i*)filter_values);

	290 // Mask out extra filter taps.

	291 __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, r==3?-1:0, r>=2?-1:0, -1);

	292 coeff = _mm_and_si128(coeff, mask);

	293 __m128i coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	294 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	295

	296 // TODO(jiesun): line buffer must be pad to align_up(filter_offset, 16);
	evannier 2011/02/14 23:45:13: padded. jiesun 2011/02/17 20:17:58: Done.
	297 __m128i src8 = _mm_loadu_si128((__m128i*)start);

	298 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	299 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	300 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	301 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	302 accum = _mm_add_epi32(accum, t);

	303 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	304 accum = _mm_add_epi32(accum, t);

	305

	306 src16 = _mm_unpackhi_epi8(src8, zero);

	307 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	308 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	309 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	310 mul_lo = _mm_mullo_epi16(src16, coeff16);

	311 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	312 accum = _mm_add_epi32(accum, t);

	313 }

	314

	315 // shift right for fix point implementation before saturation.

	316 accum = _mm_srai_epi32 (accum, ConvolutionFilter1D::kShiftBits);

	317 accum = _mm_packs_epi32(accum, zero);

	318 accum = _mm_packus_epi16(accum, zero);
	evannier 2011/02/14 23:45:13: I am missing something. The first packs_epi32 will move everything in 16 bit values, sign saturated: // [16] 0, 0, 0, 0, a', r', g', b' The second, will take the 16 bit values, and apply the same thing, except this is unsigned. So, if we have a negative value, say a' is 0xFFFF, the resulting a'' will be 0xFF, right ? (using [8] 0 0 ... 0 a'' r'' g'' b'') Then, instead of having '0', we end up getting 0xFF. As a test, make sure that your convolver handles well negative values as coefficients (to generate a negative value), and very large values for > 0xFF values. jiesun 2011/02/15 00:19:22: According to http://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.80).aspx it will pack 16 bits to 8 bits with unsigned saturation. that's what i want exactly. ?
	319

	320 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum);

	321 out_row += 4;

	322 }

	323 #endif

	324 }

	325

	326 // Convolves horizontally along four rows. The row data is given in

	327 // \|src_data\| and continues for the num_values() of the filter.

	328 template<bool has_alpha>

	329 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],

	330 const ConvolutionFilter1D& filter,

	331 unsigned char* out_row[4]) {

	332 #ifdef ARCH_CPU_X86_FAMILY
	evannier 2011/02/14 23:45:13: Since this code is SSE2, I would use the preprocessor SSE2: #if defined __SSE2__ Not sure if this is portable across all platforms, but it will work on GCC. That implies that the code would need to be compiled with -msse2 to be included, which is not acceptable for compatibility with non sse2 processors, since you do not want GCC to generate SSE instructions when it has no right to. So, what I had been thinking was to split off this file into multiple files, compiled for various targets. This way you are guaranteed that the compiler will not complain for x86 targets that do not compile with -msse2 (actually I am not sure whether this is something that all versions of Chrome already do, since our platform can always assume that x86 => sse2 or sse3 for that matter). I do not know if this a good example, but Skia handles this type of issues in this fashion: - Create a function getter: typedef (Foo)(myinputs); Foo getFunction() { Foo* p = platformFunction(); if (p) { return p; } return GenericFoo; } Then, for each platform, you have: PlatformFunction_x86() { if (has_sse2) { // check the right registers. return Foo_SSE2; } return NULL; } The two files containing the functions above being compiled without -msse2 Then, another file: Foo_SSE2.cc Foo_SSE2() { // put the SSE2 code here } this one compiled with SSE2. Again, this might be moot if Chrome assumes that all platforms it runs on have SSE2, but I thought I mentioned it. jiesun 2011/02/15 00:19:22: Thanks, I was trying to do that function pointer magic before, but there are some issues with gcc function pointer with templated function... I am thinking to use virtual function and create different convolver class on different platform.
	333 int width = filter.num_values();

	334

	335 int filter_offset, filter_length;

	336 __m128i zero = _mm_setzero_si128();

	337

	338 for (int i=0; i < width; i+=1) {

	339 const ConvolutionFilter1D::Fixed* filter_values =

	340 filter.FilterForValue(i, &filter_offset, &filter_length);

	341

	342 // four pixels in a column per iteration.

	343 __m128i accum0 = _mm_setzero_si128();

	344 __m128i accum1 = _mm_setzero_si128();

	345 __m128i accum2 = _mm_setzero_si128();

	346 __m128i accum3 = _mm_setzero_si128();

	347 int start = (filter_offset<<2);

	348 for (int j = 0; j < (filter_length >> 2); ++j) {

	349 __m128i coeff, coeff16lo, coeff16hi;

	350 // [16] xx xx xx xx c3 c2 c1 c0

	351 coeff = _mm_loadl_epi64((__m128i*)filter_values);

	352 // [16] xx xx xx xx c1 c1 c0 c0

	353 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	354 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	355 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	356 // [16] xx xx xx xx c3 c3 c2 c2

	357 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	358 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	359 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	360

	361 __m128i src8, src16, mul_hi, mul_lo, t;

	362

	363 #define ITERATION(src, accum) \

	364 src8 = _mm_loadu_si128((__m128i*)(src)); \

	365 src16 = _mm_unpacklo_epi8(src8, zero); \

	366 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \

	367 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \

	368 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	369 accum = _mm_add_epi32(accum, t); \

	370 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	371 accum = _mm_add_epi32(accum, t); \

	372 src16 = _mm_unpackhi_epi8(src8, zero); \

	373 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \

	374 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \

	375 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	376 accum = _mm_add_epi32(accum, t); \

	377 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	378 accum = _mm_add_epi32(accum, t)

	379

	380 ITERATION(src_data[0]+start, accum0);

	381 ITERATION(src_data[1]+start, accum1);

	382 ITERATION(src_data[2]+start, accum2);

	383 ITERATION(src_data[3]+start, accum3);

	384

	385 start += 16;

	386 filter_values += 4;

	387 }

	388

	389 int r = filter_length&3;

	390 if (r) {

	391 // Note: filter_values must be pad to align_up(filter_offset, 8);

	392 __m128i coeff = _mm_loadl_epi64((__m128i*)filter_values);

	393 __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, r==3?-1:0, r>=2?-1:0, -1);

	394 coeff = _mm_and_si128(coeff, mask);

	395

	396 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	397 /* c1 c1 c1 c1 c0 c0 c0 c0 */

	398 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	399 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	400 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	401

	402 __m128i src8, src16, mul_hi, mul_lo, t;

	403

	404 ITERATION(src_data[0]+start, accum0);

	405 ITERATION(src_data[1]+start, accum1);

	406 ITERATION(src_data[2]+start, accum2);

	407 ITERATION(src_data[3]+start, accum3);

	408 }

	409

	410 accum0 = _mm_srai_epi32 (accum0, ConvolutionFilter1D::kShiftBits);

	411 accum0 = _mm_packs_epi32(accum0, zero);

	412 accum0 = _mm_packus_epi16(accum0, zero);

	413 accum1 = _mm_srai_epi32 (accum1, ConvolutionFilter1D::kShiftBits);

	414 accum1 = _mm_packs_epi32(accum1, zero);

	415 accum1 = _mm_packus_epi16(accum1, zero);

	416 accum2 = _mm_srai_epi32 (accum2, ConvolutionFilter1D::kShiftBits);

	417 accum2 = _mm_packs_epi32(accum2, zero);

	418 accum2 = _mm_packus_epi16(accum2, zero);

	419 accum3 = _mm_srai_epi32 (accum3, ConvolutionFilter1D::kShiftBits);

	420 accum3 = _mm_packs_epi32(accum3, zero);

	421 accum3 = _mm_packus_epi16(accum3, zero);

	422

	423 (reinterpret_cast<int>(out_row[0])) = _mm_cvtsi128_si32(accum0);

	424 (reinterpret_cast<int>(out_row[1])) = _mm_cvtsi128_si32(accum1);

	425 (reinterpret_cast<int>(out_row[2])) = _mm_cvtsi128_si32(accum2);

	426 (reinterpret_cast<int>(out_row[3])) = _mm_cvtsi128_si32(accum3);

	427

	428 out_row[0] += 4;

	429 out_row[1] += 4;

	430 out_row[2] += 4;

	431 out_row[3] += 4;

	432 }

	433 #endif

	434 }

	435

	436 template<bool has_alpha>

	437 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,

	438 int filter_length,

	439 unsigned char* const* source_data_rows,

	440 int pixel_width,

	441 unsigned char* out_row) {

	442 #ifdef ARCH_CPU_X86_FAMILY

	443 int width = pixel_width & ~3;

	444

	445 __m128i zero = _mm_setzero_si128();

	446 __m128i accum0, accum1, accum2, accum3, coeff16;

	447 for (int i = 0; i < width; i += 4) { // Four pixels per iteration.

	448 accum0 = _mm_setzero_si128();

	449 accum1 = _mm_setzero_si128();

	450 accum2 = _mm_setzero_si128();

	451 accum3 = _mm_setzero_si128();

	452 for (int j = 0; j < filter_length; ++j) {

	453 coeff16 = _mm_set1_epi16(filter_values[j]);

	454

	455 // aligned load due to row_buffer is 16 byte aligned.

	456 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	457 __m128i src8 = _mm_loadu_si128((__m128i*)(&source_data_rows[j][i<<2]));

	458 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	459 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	460 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	461 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	462 // [32] a0 b0 g0 r0

	463 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	464 accum0 = _mm_add_epi32(accum0, t);

	465 // [32] a1 b1 g1 r1

	466 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	467 accum1 = _mm_add_epi32(accum1, t);

	468 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	469 src16 = _mm_unpackhi_epi8(src8, zero);

	470 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	471 mul_lo = _mm_mullo_epi16(src16, coeff16);

	472 // [32] a2 b2 g2 r2

	473 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	474 accum2 = _mm_add_epi32(accum2, t);

	475 // [32] a3 b3 g3 r3

	476 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	477 accum3 = _mm_add_epi32(accum3, t);

	478 }

	479

	480 accum0 = _mm_srai_epi32 (accum0, ConvolutionFilter1D::kShiftBits);

	481 accum1 = _mm_srai_epi32 (accum1, ConvolutionFilter1D::kShiftBits);

	482 accum2 = _mm_srai_epi32 (accum2, ConvolutionFilter1D::kShiftBits);

	483 accum3 = _mm_srai_epi32 (accum3, ConvolutionFilter1D::kShiftBits);

	484 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	485 accum0 = _mm_packs_epi32(accum0, accum1);

	486 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	487 accum2 = _mm_packs_epi32(accum2, accum3);

	488 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	489 accum0 = _mm_packus_epi16(accum0, accum2);

	490

	491 if (has_alpha) {

	492 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	493 __m128i a = _mm_srli_epi32(accum0, 8);

	494 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	495 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	496 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	497 a = _mm_srli_epi32(accum0, 16);

	498 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	499 b = _mm_max_epu8(a, b); // Max of r and g and b.

	500 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	501 b = _mm_slli_epi32(b, 24);

	502 accum0 = _mm_max_epu8(b, accum0);

	503 } else {

	504 __m128i mask = _mm_set_epi32(0xff000000, 0xff000000,

	505 0xff000000, 0xff000000);

	506 accum0 = _mm_or_si128(accum0, mask);

	507 }

	508 _mm_storeu_si128((__m128i*)out_row, accum0);

	509 out_row += 16;

	510 }

	511

	512 if (pixel_width & 3) {

	513 accum0 = _mm_setzero_si128();

	514 accum1 = _mm_setzero_si128();

	515 accum2 = _mm_setzero_si128();

	516 for (int j = 0; j < filter_length; ++j) {

	517 coeff16 = _mm_set1_epi16(filter_values[j]);

	518

	519 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	520 __m128i src8 = _mm_loadu_si128((__m128i*)(&source_data_rows[j][width<<2]) );

	521 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	522 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	523 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	524 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	525 // [32] a0 b0 g0 r0

	526 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	527 accum0 = _mm_add_epi32(accum0, t);

	528 // [32] a1 b1 g1 r1

	529 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	530 accum1 = _mm_add_epi32(accum1, t);

	531 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	532 src16 = _mm_unpackhi_epi8(src8, zero);

	533 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	534 mul_lo = _mm_mullo_epi16(src16, coeff16);

	535 // [32] a2 b2 g2 r2

	536 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	537 accum2 = _mm_add_epi32(accum2, t);

	538 }

	539

	540 accum0 = _mm_srai_epi32 (accum0, ConvolutionFilter1D::kShiftBits);

	541 accum1 = _mm_srai_epi32 (accum1, ConvolutionFilter1D::kShiftBits);

	542 accum2 = _mm_srai_epi32 (accum2, ConvolutionFilter1D::kShiftBits);

	543 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	544 accum0 = _mm_packs_epi32(accum0, accum1);

	545 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	546 accum2 = _mm_packs_epi32(accum2, zero);

	547 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	548 accum0 = _mm_packus_epi16(accum0, accum2);

	549

	550 if (has_alpha) {

	551 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	552 __m128i a = _mm_srli_epi32(accum0, 8);

	553 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	554 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	555 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	556 a = _mm_srli_epi32(accum0, 16);

	557 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	558 b = _mm_max_epu8(a, b); // Max of r and g and b.

	559 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	560 b = _mm_slli_epi32(b, 24);

	561 accum0 = _mm_max_epu8(b, accum0);

	562 } else {

	563 __m128i mask = _mm_set_epi32(0xff000000, 0xff000000,

	564 0xff000000, 0xff000000);

	565 accum0 = _mm_or_si128(accum0, mask);

	566 }

	567

	568 for (int i = width; i < pixel_width; ++i) {

	569 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum0);

	570 accum0 = _mm_srli_si128(accum0, 4);

	571 out_row += 4;

	572 }

	573 }

	574 #endif

	575 }

	576

	577

222 } // namespace	578 } // namespace

223	579

224 // ConvolutionFilter1D ---------------------------------------------------------	580 // ConvolutionFilter1D ---------------------------------------------------------

225	581

226 ConvolutionFilter1D::ConvolutionFilter1D()	582 ConvolutionFilter1D::ConvolutionFilter1D()

227 : max_filter_(0) {	583 : max_filter_(0) {

228 }	584 }

229	585

230 ConvolutionFilter1D::~ConvolutionFilter1D() {	586 ConvolutionFilter1D::~ConvolutionFilter1D() {

231 }	587 }

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
285 }	641 }

286	642

287 // BGRAConvolve2D -------------------------------------------------------------	643 // BGRAConvolve2D -------------------------------------------------------------

288	644

289 void BGRAConvolve2D(const unsigned char* source_data,	645 void BGRAConvolve2D(const unsigned char* source_data,

290 int source_byte_row_stride,	646 int source_byte_row_stride,

291 bool source_has_alpha,	647 bool source_has_alpha,

292 const ConvolutionFilter1D& filter_x,	648 const ConvolutionFilter1D& filter_x,

293 const ConvolutionFilter1D& filter_y,	649 const ConvolutionFilter1D& filter_y,

294 int output_byte_row_stride,	650 int output_byte_row_stride,

295 unsigned char* output) {	651 unsigned char* output,

	652 bool use_sse2) {
	evannier 2011/02/14 23:45:13: This does not seem right to be part of the API, given that SSE2 is not a relevant feature from the outside. The problem I understand is that the padding you need to apply is going to be dependent on whether this is going to be set. Suggesting creating a Convolver Engine virtual class which can have a type SSE2, and then knows how to deal with padding and realignment of coefficients. jiesun 2011/02/15 00:19:22: Also, to test by comparing the two versions, I had to override the default decision at some level...
296 int max_y_filter_size = filter_y.max_filter();	653 int max_y_filter_size = filter_y.max_filter();

297	654

298 // The next row in the input that we will generate a horizontally	655 // The next row in the input that we will generate a horizontally

299 // convolved row for. If the filter doesn't start at the beginning of the	656 // convolved row for. If the filter doesn't start at the beginning of the

300 // image (this is the case when we are only resizing a subset), then we	657 // image (this is the case when we are only resizing a subset), then we

301 // don't want to generate any output rows before that. Compute the starting	658 // don't want to generate any output rows before that. Compute the starting

302 // row for convolution as the first pixel for the first vertical filter.	659 // row for convolution as the first pixel for the first vertical filter.

303 int filter_offset, filter_length;	660 int filter_offset, filter_length;

304 const ConvolutionFilter1D::Fixed* filter_values =	661 const ConvolutionFilter1D::Fixed* filter_values =

305 filter_y.FilterForValue(0, &filter_offset, &filter_length);	662 filter_y.FilterForValue(0, &filter_offset, &filter_length);

306 int next_x_row = filter_offset;	663 int next_x_row = filter_offset;

307	664

308 // We loop over each row in the input doing a horizontal convolution. This	665 // We loop over each row in the input doing a horizontal convolution. This

309 // will result in a horizontally convolved image. We write the results into	666 // will result in a horizontally convolved image. We write the results into

310 // a circular buffer of convolved rows and do vertical convolution as rows	667 // a circular buffer of convolved rows and do vertical convolution as rows

311 // are available. This prevents us from having to store the entire	668 // are available. This prevents us from having to store the entire

312 // intermediate image and helps cache coherency.	669 // intermediate image and helps cache coherency.

313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size,	670 // We will need four extra rows to allow horizontal convolution could be done

	671 // simultaneously.

	672 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;

	673 int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);

	674 CircularRowBuffer row_buffer(row_buffer_width,

	675 row_buffer_height,

314 filter_offset);	676 filter_offset);

315	677

316 // Loop over every possible output row, processing just enough horizontal	678 // Loop over every possible output row, processing just enough horizontal

317 // convolutions to run each subsequent vertical convolution.	679 // convolutions to run each subsequent vertical convolution.

318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);	680 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);

319 int num_output_rows = filter_y.num_values();	681 int num_output_rows = filter_y.num_values();

	682

	683 int last_filter_offset, last_filter_length;

	684 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset,

	685 &last_filter_length);

	686

320 for (int out_y = 0; out_y < num_output_rows; out_y++) {	687 for (int out_y = 0; out_y < num_output_rows; out_y++) {

321 filter_values = filter_y.FilterForValue(out_y,	688 filter_values = filter_y.FilterForValue(out_y,

322 &filter_offset, &filter_length);	689 &filter_offset, &filter_length);

323	690

324 // Generate output rows until we have enough to run the current filter.	691 // Generate output rows until we have enough to run the current filter.

325 while (next_x_row < filter_offset + filter_length) {	692 if (use_sse2) {

326 if (source_has_alpha) {	693 while (next_x_row < filter_offset + filter_length) {

327 ConvolveHorizontally<true>(	694 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {

328 &source_data[next_x_row * source_byte_row_stride],	695 //if (0) {
	fbarchard 2011/02/15 23:16:45: Remove if you're done with this.
329 filter_x, row_buffer.AdvanceRow());	696 const unsigned char* src[4];

330 } else {	697 unsigned char* out_row[4];

331 ConvolveHorizontally<false>(	698 for (int i=0; i<4; i++) {

332 &source_data[next_x_row * source_byte_row_stride],	699 src[i] = &source_data[(next_x_row+i) * source_byte_row_stride];

333 filter_x, row_buffer.AdvanceRow());	700 out_row[i] = row_buffer.AdvanceRow();

	701 }

	702 if (source_has_alpha)

	703 ConvolveHorizontally4_SSE2<true>(src, filter_x, out_row);

	704 else

	705 ConvolveHorizontally4_SSE2<false>(src, filter_x, out_row);

	706 next_x_row+=4;

	707 } else {

	708 // For the last row, the SSE2 loads may access data beyond the

	709 // image area, so we use the C version here. Hacking into skia

	710 // is not something I have in mind.

	711 if (next_x_row == last_filter_offset + last_filter_length - 1 ) {

	712 if (source_has_alpha)

	713 ConvolveHorizontally<true>(

	714 &source_data[next_x_row * source_byte_row_stride],

	715 filter_x, row_buffer.AdvanceRow());

	716 else

	717 ConvolveHorizontally<false>(

	718 &source_data[next_x_row * source_byte_row_stride],

	719 filter_x, row_buffer.AdvanceRow());

	720 } else {

	721 if (source_has_alpha)

	722 ConvolveHorizontally_SSE2<true>(

	723 &source_data[next_x_row * source_byte_row_stride],

	724 filter_x, row_buffer.AdvanceRow());

	725 else

	726 ConvolveHorizontally_SSE2<false>(

	727 &source_data[next_x_row * source_byte_row_stride],

	728 filter_x, row_buffer.AdvanceRow());

	729 }

	730 next_x_row++;

	731 }

334 }	732 }

335 next_x_row++;	733 } else {

	734 while (next_x_row < filter_offset + filter_length) {

	735 if (source_has_alpha) {

	736 ConvolveHorizontally<true>(

	737 &source_data[next_x_row * source_byte_row_stride],

	738 filter_x, row_buffer.AdvanceRow());

	739 } else {

	740 ConvolveHorizontally<false>(

	741 &source_data[next_x_row * source_byte_row_stride],

	742 filter_x, row_buffer.AdvanceRow());

	743 }

	744 next_x_row++;

	745 }

336 }	746 }

337	747

338 // Compute where in the output image this row of final data will go.	748 // Compute where in the output image this row of final data will go.

339 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];	749 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];

340	750

341 // Get the list of rows that the circular buffer has, in order.	751 // Get the list of rows that the circular buffer has, in order.

342 int first_row_in_circular_buffer;	752 int first_row_in_circular_buffer;

343 unsigned char* const* rows_to_convolve =	753 unsigned char* const* rows_to_convolve =

344 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);	754 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);

345	755

346 // Now compute the start of the subset of those rows that the filter	756 // Now compute the start of the subset of those rows that the filter

347 // needs.	757 // needs.

348 unsigned char* const* first_row_for_filter =	758 unsigned char* const* first_row_for_filter =

349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];	759 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];

350	760

351 if (source_has_alpha) {	761 if (source_has_alpha) {

352 ConvolveVertically<true>(filter_values, filter_length,	762 if (use_sse2) {

353 first_row_for_filter,	763 ConvolveVertically_SSE2<true>(filter_values, filter_length,

354 filter_x.num_values(), cur_output_row);	764 first_row_for_filter,

	765 filter_x.num_values(), cur_output_row);

	766 } else {

	767 ConvolveVertically<true>(filter_values, filter_length,

	768 first_row_for_filter,

	769 filter_x.num_values(), cur_output_row);

	770 }

355 } else {	771 } else {

356 ConvolveVertically<false>(filter_values, filter_length,	772 if (use_sse2) {

357 first_row_for_filter,	773 ConvolveVertically_SSE2<false>(filter_values, filter_length,

358 filter_x.num_values(), cur_output_row);	774 first_row_for_filter,

	775 filter_x.num_values(), cur_output_row);

	776 } else {

	777 ConvolveVertically<false>(filter_values, filter_length,

	778 first_row_for_filter,

	779 filter_x.num_values(), cur_output_row);

	780 }

359 }	781 }

360 }	782 }

361 }	783 }

362	784

363 } // namespace skia	785 } // namespace skia

OLD	NEW

« no previous file with comments | « skia/ext/convolver.h ('k') | skia/ext/image_operations.h » ('j') | skia/ext/image_operations.cc » ('J')