skia/ext/convolver_SSE2.cc - Issue 2011713003: Roll skia to 8cc209111876b7c78b5ec577c9221d8ed5e21024

Side by Side Diff: skia/ext/convolver_SSE2.cc

Issue 2011713003: Roll skia to 8cc209111876b7c78b5ec577c9221d8ed5e21024 (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include <algorithm>

	6

	7 #include "skia/ext/convolver.h"

	8 #include "skia/ext/convolver_SSE2.h"

	9 #include "third_party/skia/include/core/SkTypes.h"

	10

	11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h

	12

	13 namespace skia {

	14

	15 // Convolves horizontally along a single row. The row data is given in

	16 // \|src_data\| and continues for the num_values() of the filter.

	17 void ConvolveHorizontally_SSE2(const unsigned char* src_data,

	18 const ConvolutionFilter1D& filter,

	19 unsigned char* out_row,

	20 bool /has_alpha/) {

	21 int num_values = filter.num_values();

	22

	23 int filter_offset, filter_length;

	24 __m128i zero = _mm_setzero_si128();

	25 __m128i mask[4];

	26 // \|mask\| will be used to decimate all extra filter coefficients that are

	27 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	28 // mask[0] is not used in following algorithm.

	29 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

	30 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

	31 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

	32

	33 // Output one pixel each iteration, calculating all channels (RGBA) together.

	34 for (int out_x = 0; out_x < num_values; out_x++) {

	35 const ConvolutionFilter1D::Fixed* filter_values =

	36 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	37

	38 __m128i accum = _mm_setzero_si128();

	39

	40 // Compute the first pixel in this row that the filter affects. It will

	41 // touch \|filter_length\| pixels (4 bytes each) after this.

	42 const __m128i* row_to_filter =

	43 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

	44

	45 // We will load and accumulate with four coefficients per iteration.

	46 for (int filter_x = 0; filter_x<filter_length>> 2; filter_x++) {

	47 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.

	48 __m128i coeff, coeff16;

	49 // [16] xx xx xx xx c3 c2 c1 c0

	50 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	51 // [16] xx xx xx xx c1 c1 c0 c0

	52 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	53 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	54 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	55

	56 // Load four pixels => unpack the first two pixels to 16 bits =>

	57 // multiply with coefficients => accumulate the convolution result.

	58 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	59 __m128i src8 = _mm_loadu_si128(row_to_filter);

	60 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	61 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	62 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	63 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	64 // [32] a0c0 b0c0 g0c0 r0c0

	65 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	66 accum = _mm_add_epi32(accum, t);

	67 // [32] a1c1 b1c1 g1c1 r1c1

	68 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	69 accum = _mm_add_epi32(accum, t);

	70

	71 // Duplicate 3rd and 4th coefficients for all channels =>

	72 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients

	73 // => accumulate the convolution results.

	74 // [16] xx xx xx xx c3 c3 c2 c2

	75 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	76 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	77 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	78 // [16] a3 g3 b3 r3 a2 g2 b2 r2

	79 src16 = _mm_unpackhi_epi8(src8, zero);

	80 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	81 mul_lo = _mm_mullo_epi16(src16, coeff16);

	82 // [32] a2c2 b2c2 g2c2 r2c2

	83 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	84 accum = _mm_add_epi32(accum, t);

	85 // [32] a3c3 b3c3 g3c3 r3c3

	86 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	87 accum = _mm_add_epi32(accum, t);

	88

	89 // Advance the pixel and coefficients pointers.

	90 row_to_filter += 1;

	91 filter_values += 4;

	92 }

	93

	94 // When \|filter_length\| is not divisible by 4, we need to decimate some of

	95 // the filter coefficient that was loaded incorrectly to zero; Other than

	96 // that the algorithm is same with above, exceot that the 4th pixel will be

	97 // always absent.

	98 int r = filter_length & 3;

	99 if (r) {

	100 // Note: filter_values must be padded to align_up(filter_offset, 8).

	101 __m128i coeff, coeff16;

	102 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	103 // Mask out extra filter taps.

	104 coeff = _mm_and_si128(coeff, mask[r]);

	105 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	106 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	107

	108 // Note: line buffer must be padded to align_up(filter_offset, 16).

	109 // We resolve this by use C-version for the last horizontal line.

	110 __m128i src8 = _mm_loadu_si128(row_to_filter);

	111 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	112 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	113 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	114 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	115 accum = _mm_add_epi32(accum, t);

	116 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	117 accum = _mm_add_epi32(accum, t);

	118

	119 src16 = _mm_unpackhi_epi8(src8, zero);

	120 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	121 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

	122 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	123 mul_lo = _mm_mullo_epi16(src16, coeff16);

	124 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	125 accum = _mm_add_epi32(accum, t);

	126 }

	127

	128 // Shift right for fixed point implementation.

	129 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);

	130

	131 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	132 accum = _mm_packs_epi32(accum, zero);

	133 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	134 accum = _mm_packus_epi16(accum, zero);

	135

	136 // Store the pixel value of 32 bits.

	137 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum);

	138 out_row += 4;

	139 }

	140 }

	141

	142 // Convolves horizontally along four rows. The row data is given in

	143 // \|src_data\| and continues for the num_values() of the filter.

	144 // The algorithm is almost same as \|ConvolveHorizontally_SSE2\|. Please

	145 // refer to that function for detailed comments.

	146 void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],

	147 const ConvolutionFilter1D& filter,

	148 unsigned char* out_row[4]) {

	149 int num_values = filter.num_values();

	150

	151 int filter_offset, filter_length;

	152 __m128i zero = _mm_setzero_si128();

	153 __m128i mask[4];

	154 // \|mask\| will be used to decimate all extra filter coefficients that are

	155 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	156 // mask[0] is not used in following algorithm.

	157 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

	158 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

	159 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

	160

	161 // Output one pixel each iteration, calculating all channels (RGBA) together.

	162 for (int out_x = 0; out_x < num_values; out_x++) {

	163 const ConvolutionFilter1D::Fixed* filter_values =

	164 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	165

	166 // four pixels in a column per iteration.

	167 __m128i accum0 = _mm_setzero_si128();

	168 __m128i accum1 = _mm_setzero_si128();

	169 __m128i accum2 = _mm_setzero_si128();

	170 __m128i accum3 = _mm_setzero_si128();

	171 int start = (filter_offset << 2);

	172 // We will load and accumulate with four coefficients per iteration.

	173 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {

	174 __m128i coeff, coeff16lo, coeff16hi;

	175 // [16] xx xx xx xx c3 c2 c1 c0

	176 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	177 // [16] xx xx xx xx c1 c1 c0 c0

	178 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	179 // [16] c1 c1 c1 c1 c0 c0 c0 c0

	180 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	181 // [16] xx xx xx xx c3 c3 c2 c2

	182 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	183 // [16] c3 c3 c3 c3 c2 c2 c2 c2

	184 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	185

	186 __m128i src8, src16, mul_hi, mul_lo, t;

	187

	188 #define ITERATION(src, accum) \

	189 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \

	190 src16 = _mm_unpacklo_epi8(src8, zero); \

	191 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \

	192 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \

	193 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	194 accum = _mm_add_epi32(accum, t); \

	195 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	196 accum = _mm_add_epi32(accum, t); \

	197 src16 = _mm_unpackhi_epi8(src8, zero); \

	198 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \

	199 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \

	200 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \

	201 accum = _mm_add_epi32(accum, t); \

	202 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \

	203 accum = _mm_add_epi32(accum, t)

	204

	205 ITERATION(src_data[0] + start, accum0);

	206 ITERATION(src_data[1] + start, accum1);

	207 ITERATION(src_data[2] + start, accum2);

	208 ITERATION(src_data[3] + start, accum3);

	209

	210 start += 16;

	211 filter_values += 4;

	212 }

	213

	214 int r = filter_length & 3;

	215 if (r) {

	216 // Note: filter_values must be padded to align_up(filter_offset, 8);

	217 __m128i coeff;

	218 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

	219 // Mask out extra filter taps.

	220 coeff = _mm_and_si128(coeff, mask[r]);

	221

	222 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

	223 /* c1 c1 c1 c1 c0 c0 c0 c0 */

	224 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

	225 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

	226 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

	227

	228 __m128i src8, src16, mul_hi, mul_lo, t;

	229

	230 ITERATION(src_data[0] + start, accum0);

	231 ITERATION(src_data[1] + start, accum1);

	232 ITERATION(src_data[2] + start, accum2);

	233 ITERATION(src_data[3] + start, accum3);

	234 }

	235

	236 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

	237 accum0 = _mm_packs_epi32(accum0, zero);

	238 accum0 = _mm_packus_epi16(accum0, zero);

	239 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

	240 accum1 = _mm_packs_epi32(accum1, zero);

	241 accum1 = _mm_packus_epi16(accum1, zero);

	242 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

	243 accum2 = _mm_packs_epi32(accum2, zero);

	244 accum2 = _mm_packus_epi16(accum2, zero);

	245 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

	246 accum3 = _mm_packs_epi32(accum3, zero);

	247 accum3 = _mm_packus_epi16(accum3, zero);

	248

	249 (reinterpret_cast<int>(out_row[0])) = _mm_cvtsi128_si32(accum0);

	250 (reinterpret_cast<int>(out_row[1])) = _mm_cvtsi128_si32(accum1);

	251 (reinterpret_cast<int>(out_row[2])) = _mm_cvtsi128_si32(accum2);

	252 (reinterpret_cast<int>(out_row[3])) = _mm_cvtsi128_si32(accum3);

	253

	254 out_row[0] += 4;

	255 out_row[1] += 4;

	256 out_row[2] += 4;

	257 out_row[3] += 4;

	258 }

	259 }

	260

	261 // Does vertical convolution to produce one output row. The filter values and

	262 // length are given in the first two parameters. These are applied to each

	263 // of the rows pointed to in the \|source_data_rows\| array, with each row

	264 // being \|pixel_width\| wide.

	265 //

	266 // The output must have room for \|pixel_width * 4\| bytes.

	267 template <bool has_alpha>

	268 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,

	269 int filter_length,

	270 unsigned char* const* source_data_rows,

	271 int pixel_width,

	272 unsigned char* out_row) {

	273 int width = pixel_width & ~3;

	274

	275 __m128i zero = _mm_setzero_si128();

	276 __m128i accum0, accum1, accum2, accum3, coeff16;

	277 const __m128i* src;

	278 // Output four pixels per iteration (16 bytes).

	279 for (int out_x = 0; out_x < width; out_x += 4) {

	280 // Accumulated result for each pixel. 32 bits per RGBA channel.

	281 accum0 = _mm_setzero_si128();

	282 accum1 = _mm_setzero_si128();

	283 accum2 = _mm_setzero_si128();

	284 accum3 = _mm_setzero_si128();

	285

	286 // Convolve with one filter coefficient per iteration.

	287 for (int filter_y = 0; filter_y < filter_length; filter_y++) {

	288 // Duplicate the filter coefficient 8 times.

	289 // [16] cj cj cj cj cj cj cj cj

	290 coeff16 = _mm_set1_epi16(filter_values[filter_y]);

	291

	292 // Load four pixels (16 bytes) together.

	293 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	294 src = reinterpret_cast<const __m128i*>(

	295 &source_data_rows[filter_y][out_x << 2]);

	296 __m128i src8 = _mm_loadu_si128(src);

	297

	298 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>

	299 // multiply with current coefficient => accumulate the result.

	300 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	301 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	302 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	303 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	304 // [32] a0 b0 g0 r0

	305 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	306 accum0 = _mm_add_epi32(accum0, t);

	307 // [32] a1 b1 g1 r1

	308 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	309 accum1 = _mm_add_epi32(accum1, t);

	310

	311 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>

	312 // multiply with current coefficient => accumulate the result.

	313 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	314 src16 = _mm_unpackhi_epi8(src8, zero);

	315 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	316 mul_lo = _mm_mullo_epi16(src16, coeff16);

	317 // [32] a2 b2 g2 r2

	318 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	319 accum2 = _mm_add_epi32(accum2, t);

	320 // [32] a3 b3 g3 r3

	321 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	322 accum3 = _mm_add_epi32(accum3, t);

	323 }

	324

	325 // Shift right for fixed point implementation.

	326 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

	327 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

	328 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

	329 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

	330

	331 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	332 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	333 accum0 = _mm_packs_epi32(accum0, accum1);

	334 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	335 accum2 = _mm_packs_epi32(accum2, accum3);

	336

	337 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	338 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	339 accum0 = _mm_packus_epi16(accum0, accum2);

	340

	341 if (has_alpha) {

	342 // Compute the max(ri, gi, bi) for each pixel.

	343 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	344 __m128i a = _mm_srli_epi32(accum0, 8);

	345 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	346 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	347 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	348 a = _mm_srli_epi32(accum0, 16);

	349 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	350 b = _mm_max_epu8(a, b); // Max of r and g and b.

	351 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	352 b = _mm_slli_epi32(b, 24);

	353

	354 // Make sure the value of alpha channel is always larger than maximum

	355 // value of color channels.

	356 accum0 = _mm_max_epu8(b, accum0);

	357 } else {

	358 // Set value of alpha channels to 0xFF.

	359 __m128i mask = _mm_set1_epi32(0xff000000);

	360 accum0 = _mm_or_si128(accum0, mask);

	361 }

	362

	363 // Store the convolution result (16 bytes) and advance the pixel pointers.

	364 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);

	365 out_row += 16;

	366 }

	367

	368 // When the width of the output is not divisible by 4, We need to save one

	369 // pixel (4 bytes) each time. And also the fourth pixel is always absent.

	370 if (pixel_width & 3) {

	371 accum0 = _mm_setzero_si128();

	372 accum1 = _mm_setzero_si128();

	373 accum2 = _mm_setzero_si128();

	374 for (int filter_y = 0; filter_y < filter_length; ++filter_y) {

	375 coeff16 = _mm_set1_epi16(filter_values[filter_y]);

	376 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	377 src = reinterpret_cast<const __m128i*>(

	378 &source_data_rows[filter_y][width << 2]);

	379 __m128i src8 = _mm_loadu_si128(src);

	380 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	381 __m128i src16 = _mm_unpacklo_epi8(src8, zero);

	382 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

	383 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

	384 // [32] a0 b0 g0 r0

	385 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	386 accum0 = _mm_add_epi32(accum0, t);

	387 // [32] a1 b1 g1 r1

	388 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

	389 accum1 = _mm_add_epi32(accum1, t);

	390 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	391 src16 = _mm_unpackhi_epi8(src8, zero);

	392 mul_hi = _mm_mulhi_epi16(src16, coeff16);

	393 mul_lo = _mm_mullo_epi16(src16, coeff16);

	394 // [32] a2 b2 g2 r2

	395 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

	396 accum2 = _mm_add_epi32(accum2, t);

	397 }

	398

	399 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

	400 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

	401 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

	402 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	403 accum0 = _mm_packs_epi32(accum0, accum1);

	404 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	405 accum2 = _mm_packs_epi32(accum2, zero);

	406 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	407 accum0 = _mm_packus_epi16(accum0, accum2);

	408 if (has_alpha) {

	409 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	410 __m128i a = _mm_srli_epi32(accum0, 8);

	411 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	412 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.

	413 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	414 a = _mm_srli_epi32(accum0, 16);

	415 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	416 b = _mm_max_epu8(a, b); // Max of r and g and b.

	417 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	418 b = _mm_slli_epi32(b, 24);

	419 accum0 = _mm_max_epu8(b, accum0);

	420 } else {

	421 __m128i mask = _mm_set1_epi32(0xff000000);

	422 accum0 = _mm_or_si128(accum0, mask);

	423 }

	424

	425 for (int out_x = width; out_x < pixel_width; out_x++) {

	426 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum0);

	427 accum0 = _mm_srli_si128(accum0, 4);

	428 out_row += 4;

	429 }

	430 }

	431 }

	432

	433 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,

	434 int filter_length,

	435 unsigned char* const* source_data_rows,

	436 int pixel_width,

	437 unsigned char* out_row,

	438 bool has_alpha) {

	439 if (has_alpha) {

	440 ConvolveVertically_SSE2<true>(filter_values, filter_length,

	441 source_data_rows, pixel_width, out_row);

	442 } else {

	443 ConvolveVertically_SSE2<false>(filter_values, filter_length,

	444 source_data_rows, pixel_width, out_row);

	445 }

	446 }

	447

	448 } // namespace skia

OLD	NEW

« no previous file with comments | « skia/ext/convolver_SSE2.h ('k') | skia/ext/convolver_mips_dspr2.h » ('j') | no next file with comments »