OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2016 Google Inc. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. |
| 6 */ |
| 7 |
| 8 #ifndef SkBitmapFilter_opts_DEFINED |
| 9 #define SkBitmapFilter_opts_DEFINED |
| 10 |
| 11 #include "SkConvolver.h" |
| 12 |
| 13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 14 #include <emmintrin.h> |
| 15 #elif defined(SK_ARM_HAS_NEON) |
| 16 #include <arm_neon.h> |
| 17 #endif |
| 18 |
| 19 namespace SK_OPTS_NS { |
| 20 |
| 21 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 22 |
| 23 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, |
| 24 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i
& accum, int r) { |
| 25 int remainder[4] = {0}; |
| 26 for (int i = 0; i < r; i++) { |
| 27 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; |
| 28 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; |
| 29 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; |
| 30 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; |
| 31 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; |
| 32 } |
| 33 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem
ainder[3]); |
| 34 accum = _mm_add_epi32(accum, t); |
| 35 } |
| 36 |
| 37 // Convolves horizontally along a single row. The row data is given in |
| 38 // |srcData| and continues for the numValues() of the filter. |
| 39 void convolve_horizontally(const unsigned char* srcData, |
| 40 const SkConvolutionFilter1D& filter, |
| 41 unsigned char* outRow, |
| 42 bool /*hasAlpha*/) { |
| 43 // Output one pixel each iteration, calculating all channels (RGBA) toge
ther. |
| 44 int numValues = filter.numValues(); |
| 45 for (int outX = 0; outX < numValues; outX++) { |
| 46 // Get the filter that determines the current output pixel. |
| 47 int filterOffset, filterLength; |
| 48 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 49 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 50 |
| 51 // Compute the first pixel in this row that the filter affects. It w
ill |
| 52 // touch |filterLength| pixels (4 bytes each) after this. |
| 53 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; |
| 54 |
| 55 __m128i zero = _mm_setzero_si128(); |
| 56 __m128i accum = _mm_setzero_si128(); |
| 57 |
| 58 // We will load and accumulate with four coefficients per iteration. |
| 59 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { |
| 60 // Load 4 coefficients => duplicate 1st and 2nd of them for all
channels. |
| 61 __m128i coeff, coeff16; |
| 62 // [16] xx xx xx xx c3 c2 c1 c0 |
| 63 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV
alues)); |
| 64 // [16] xx xx xx xx c1 c1 c0 c0 |
| 65 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 66 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 67 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 68 |
| 69 // Load four pixels => unpack the first two pixels to 16 bits => |
| 70 // multiply with coefficients => accumulate the convolution resu
lt. |
| 71 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 72 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(
rowToFilter)); |
| 73 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 74 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 75 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 76 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 77 // [32] a0*c0 b0*c0 g0*c0 r0*c0 |
| 78 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 79 accum = _mm_add_epi32(accum, t); |
| 80 // [32] a1*c1 b1*c1 g1*c1 r1*c1 |
| 81 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 82 accum = _mm_add_epi32(accum, t); |
| 83 |
| 84 // Duplicate 3rd and 4th coefficients for all channels => |
| 85 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe
fficients |
| 86 // => accumulate the convolution results. |
| 87 // [16] xx xx xx xx c3 c3 c2 c2 |
| 88 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 89 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 90 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 91 // [16] a3 g3 b3 r3 a2 g2 b2 r2 |
| 92 src16 = _mm_unpackhi_epi8(src8, zero); |
| 93 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 94 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 95 // [32] a2*c2 b2*c2 g2*c2 r2*c2 |
| 96 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 97 accum = _mm_add_epi32(accum, t); |
| 98 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
| 99 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 100 accum = _mm_add_epi32(accum, t); |
| 101 |
| 102 // Advance the pixel and coefficients pointers. |
| 103 rowToFilter += 16; |
| 104 filterValues += 4; |
| 105 } |
| 106 |
| 107 // When |filterLength| is not divisible by 4, we accumulate the last
1 - 3 |
| 108 // coefficients one at a time. |
| 109 int r = filterLength & 3; |
| 110 if (r) { |
| 111 int remainderOffset = (filterOffset + filterLength - r) * 4; |
| 112 AccumRemainder(srcData + remainderOffset, filterValues, accum, r
); |
| 113 } |
| 114 |
| 115 // Shift right for fixed point implementation. |
| 116 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 117 |
| 118 // Packing 32 bits |accum| to 16 bits per channel (signed saturation
). |
| 119 accum = _mm_packs_epi32(accum, zero); |
| 120 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio
n). |
| 121 accum = _mm_packus_epi16(accum, zero); |
| 122 |
| 123 // Store the pixel value of 32 bits. |
| 124 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); |
| 125 outRow += 4; |
| 126 } |
| 127 } |
| 128 |
| 129 // Convolves horizontally along four rows. The row data is given in |
| 130 // |srcData| and continues for the numValues() of the filter. |
| 131 // The algorithm is almost same as |convolve_horizontally|. Please |
| 132 // refer to that function for detailed comments. |
| 133 void convolve_4_rows_horizontally(const unsigned char* srcData[4], |
| 134 const SkConvolutionFilter1D& filter, |
| 135 unsigned char* outRow[4], |
| 136 size_t outRowBytes) { |
| 137 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];) |
| 138 |
| 139 // Output one pixel each iteration, calculating all channels (RGBA) toge
ther. |
| 140 int numValues = filter.numValues(); |
| 141 for (int outX = 0; outX < numValues; outX++) { |
| 142 int filterOffset, filterLength; |
| 143 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 144 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 145 |
| 146 __m128i zero = _mm_setzero_si128(); |
| 147 |
| 148 // four pixels in a column per iteration. |
| 149 __m128i accum0 = _mm_setzero_si128(); |
| 150 __m128i accum1 = _mm_setzero_si128(); |
| 151 __m128i accum2 = _mm_setzero_si128(); |
| 152 __m128i accum3 = _mm_setzero_si128(); |
| 153 |
| 154 int start = filterOffset * 4; |
| 155 // We will load and accumulate with four coefficients per iteration. |
| 156 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { |
| 157 __m128i coeff, coeff16lo, coeff16hi; |
| 158 // [16] xx xx xx xx c3 c2 c1 c0 |
| 159 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV
alues)); |
| 160 // [16] xx xx xx xx c1 c1 c0 c0 |
| 161 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 162 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 163 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 164 // [16] xx xx xx xx c3 c3 c2 c2 |
| 165 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 166 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 167 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 168 |
| 169 __m128i src8, src16, mul_hi, mul_lo, t; |
| 170 |
| 171 #define ITERATION(src, accum)
\ |
| 172 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
\ |
| 173 src16 = _mm_unpacklo_epi8(src8, zero);
\ |
| 174 mul_hi = _mm_mulhi_epi16(src16, coeff16lo);
\ |
| 175 mul_lo = _mm_mullo_epi16(src16, coeff16lo);
\ |
| 176 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
\ |
| 177 accum = _mm_add_epi32(accum, t);
\ |
| 178 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
\ |
| 179 accum = _mm_add_epi32(accum, t);
\ |
| 180 src16 = _mm_unpackhi_epi8(src8, zero);
\ |
| 181 mul_hi = _mm_mulhi_epi16(src16, coeff16hi);
\ |
| 182 mul_lo = _mm_mullo_epi16(src16, coeff16hi);
\ |
| 183 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
\ |
| 184 accum = _mm_add_epi32(accum, t);
\ |
| 185 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
\ |
| 186 accum = _mm_add_epi32(accum, t) |
| 187 |
| 188 ITERATION(srcData[0] + start, accum0); |
| 189 ITERATION(srcData[1] + start, accum1); |
| 190 ITERATION(srcData[2] + start, accum2); |
| 191 ITERATION(srcData[3] + start, accum3); |
| 192 |
| 193 start += 16; |
| 194 filterValues += 4; |
| 195 } |
| 196 |
| 197 int r = filterLength & 3; |
| 198 if (r) { |
| 199 int remainderOffset = (filterOffset + filterLength - r) * 4; |
| 200 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum
0, r); |
| 201 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum
1, r); |
| 202 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum
2, r); |
| 203 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum
3, r); |
| 204 } |
| 205 |
| 206 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 207 accum0 = _mm_packs_epi32(accum0, zero); |
| 208 accum0 = _mm_packus_epi16(accum0, zero); |
| 209 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 210 accum1 = _mm_packs_epi32(accum1, zero); |
| 211 accum1 = _mm_packus_epi16(accum1, zero); |
| 212 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 213 accum2 = _mm_packs_epi32(accum2, zero); |
| 214 accum2 = _mm_packus_epi16(accum2, zero); |
| 215 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 216 accum3 = _mm_packs_epi32(accum3, zero); |
| 217 accum3 = _mm_packus_epi16(accum3, zero); |
| 218 |
| 219 // We seem to be running off the edge here (chromium:491660). |
| 220 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes
); |
| 221 |
| 222 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0); |
| 223 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1); |
| 224 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2); |
| 225 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3); |
| 226 |
| 227 outRow[0] += 4; |
| 228 outRow[1] += 4; |
| 229 outRow[2] += 4; |
| 230 outRow[3] += 4; |
| 231 } |
| 232 } |
| 233 |
| 234 // Does vertical convolution to produce one output row. The filter values an
d |
| 235 // length are given in the first two parameters. These are applied to each |
| 236 // of the rows pointed to in the |sourceDataRows| array, with each row |
| 237 // being |pixelWidth| wide. |
| 238 // |
| 239 // The output must have room for |pixelWidth * 4| bytes. |
| 240 template<bool hasAlpha> |
| 241 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte
rValues, |
| 242 int filterLength, |
| 243 unsigned char* const* sourceDataRows, |
| 244 int pixelWidth, |
| 245 unsigned char* outRow) { |
| 246 // Output four pixels per iteration (16 bytes). |
| 247 int width = pixelWidth & ~3; |
| 248 __m128i zero = _mm_setzero_si128(); |
| 249 for (int outX = 0; outX < width; outX += 4) { |
| 250 // Accumulated result for each pixel. 32 bits per RGBA channel. |
| 251 __m128i accum0 = _mm_setzero_si128(); |
| 252 __m128i accum1 = _mm_setzero_si128(); |
| 253 __m128i accum2 = _mm_setzero_si128(); |
| 254 __m128i accum3 = _mm_setzero_si128(); |
| 255 |
| 256 // Convolve with one filter coefficient per iteration. |
| 257 for (int filterY = 0; filterY < filterLength; filterY++) { |
| 258 |
| 259 // Duplicate the filter coefficient 8 times. |
| 260 // [16] cj cj cj cj cj cj cj cj |
| 261 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); |
| 262 |
| 263 // Load four pixels (16 bytes) together. |
| 264 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 265 const __m128i* src = reinterpret_cast<const __m128i*>( |
| 266 &sourceDataRows[filterY][outX << 2]); |
| 267 __m128i src8 = _mm_loadu_si128(src); |
| 268 |
| 269 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha
nnels => |
| 270 // multiply with current coefficient => accumulate the result. |
| 271 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 272 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 273 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 274 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 275 // [32] a0 b0 g0 r0 |
| 276 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 277 accum0 = _mm_add_epi32(accum0, t); |
| 278 // [32] a1 b1 g1 r1 |
| 279 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 280 accum1 = _mm_add_epi32(accum1, t); |
| 281 |
| 282 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha
nnels => |
| 283 // multiply with current coefficient => accumulate the result. |
| 284 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 285 src16 = _mm_unpackhi_epi8(src8, zero); |
| 286 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 287 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 288 // [32] a2 b2 g2 r2 |
| 289 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 290 accum2 = _mm_add_epi32(accum2, t); |
| 291 // [32] a3 b3 g3 r3 |
| 292 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 293 accum3 = _mm_add_epi32(accum3, t); |
| 294 } |
| 295 |
| 296 // Shift right for fixed point implementation. |
| 297 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 298 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 299 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 300 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 301 |
| 302 // Packing 32 bits |accum| to 16 bits per channel (signed saturation
). |
| 303 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 304 accum0 = _mm_packs_epi32(accum0, accum1); |
| 305 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 306 accum2 = _mm_packs_epi32(accum2, accum3); |
| 307 |
| 308 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio
n). |
| 309 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 310 accum0 = _mm_packus_epi16(accum0, accum2); |
| 311 |
| 312 if (hasAlpha) { |
| 313 // Compute the max(ri, gi, bi) for each pixel. |
| 314 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 315 __m128i a = _mm_srli_epi32(accum0, 8); |
| 316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 317 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 318 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 319 a = _mm_srli_epi32(accum0, 16); |
| 320 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 321 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 322 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 323 b = _mm_slli_epi32(b, 24); |
| 324 |
| 325 // Make sure the value of alpha channel is always larger than ma
ximum |
| 326 // value of color channels. |
| 327 accum0 = _mm_max_epu8(b, accum0); |
| 328 } else { |
| 329 // Set value of alpha channels to 0xFF. |
| 330 __m128i mask = _mm_set1_epi32(0xff000000); |
| 331 accum0 = _mm_or_si128(accum0, mask); |
| 332 } |
| 333 |
| 334 // Store the convolution result (16 bytes) and advance the pixel poi
nters. |
| 335 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); |
| 336 outRow += 16; |
| 337 } |
| 338 |
| 339 // When the width of the output is not divisible by 4, We need to save o
ne |
| 340 // pixel (4 bytes) each time. And also the fourth pixel is always absent
. |
| 341 int r = pixelWidth & 3; |
| 342 if (r) { |
| 343 __m128i accum0 = _mm_setzero_si128(); |
| 344 __m128i accum1 = _mm_setzero_si128(); |
| 345 __m128i accum2 = _mm_setzero_si128(); |
| 346 for (int filterY = 0; filterY < filterLength; ++filterY) { |
| 347 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); |
| 348 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 349 const __m128i* src = reinterpret_cast<const __m128i*>( |
| 350 &sourceDataRows[filterY][width << 2]); |
| 351 __m128i src8 = _mm_loadu_si128(src); |
| 352 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 353 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 354 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 355 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 356 // [32] a0 b0 g0 r0 |
| 357 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 358 accum0 = _mm_add_epi32(accum0, t); |
| 359 // [32] a1 b1 g1 r1 |
| 360 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 361 accum1 = _mm_add_epi32(accum1, t); |
| 362 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 363 src16 = _mm_unpackhi_epi8(src8, zero); |
| 364 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 365 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 366 // [32] a2 b2 g2 r2 |
| 367 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 368 accum2 = _mm_add_epi32(accum2, t); |
| 369 } |
| 370 |
| 371 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 372 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 373 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 374 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 375 accum0 = _mm_packs_epi32(accum0, accum1); |
| 376 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 377 accum2 = _mm_packs_epi32(accum2, zero); |
| 378 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 379 accum0 = _mm_packus_epi16(accum0, accum2); |
| 380 if (hasAlpha) { |
| 381 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 382 __m128i a = _mm_srli_epi32(accum0, 8); |
| 383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 384 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 385 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 386 a = _mm_srli_epi32(accum0, 16); |
| 387 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 388 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 389 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 390 b = _mm_slli_epi32(b, 24); |
| 391 accum0 = _mm_max_epu8(b, accum0); |
| 392 } else { |
| 393 __m128i mask = _mm_set1_epi32(0xff000000); |
| 394 accum0 = _mm_or_si128(accum0, mask); |
| 395 } |
| 396 |
| 397 for (int i = 0; i < r; i++) { |
| 398 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); |
| 399 accum0 = _mm_srli_si128(accum0, 4); |
| 400 outRow += 4; |
| 401 } |
| 402 } |
| 403 } |
| 404 |
| 405 #elif defined(SK_ARM_HAS_NEON) |
| 406 |
// NEON flavor of the scalar tail: folds the last 1-3 (|r|) taps into |accum|,
// one fixed-point coefficient times one RGBA pixel per tap.
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int sums[4] = {0};
    for (int tap = 0; tap < r; tap++) {
        SkConvolutionFilter1D::ConvolutionFixed k = filterValues[tap];
        sums[0] += k * pixelsLeft[tap * 4 + 0];
        sums[1] += k * pixelsLeft[tap * 4 + 1];
        sums[2] += k * pixelsLeft[tap * 4 + 2];
        sums[3] += k * pixelsLeft[tap * 4 + 3];
    }
    int32x4_t packed = {sums[0], sums[1], sums[2], sums[3]};
    accum += packed;
}
| 420 |
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // vtbl masks that replicate coefficient i (2 bytes) across a 64-bit
        // lane, i.e. broadcast c0..c3 to all four 16-bit positions.
        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
        // Fetch the filter window for this output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // First source pixel this filter touches; it reads |filterLength|
        // pixels (4 bytes each) from here on.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
            // Load 4 coefficients and broadcast each across a 16x4 vector.
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            // Load four pixels, widen to 16 bits, multiply-accumulate.
            uint8x16_t pixels = vld1q_u8(rowToFilter);
            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            int16x4_t p0_src = vget_low_s16(p01_16);
            int16x4_t p1_src = vget_high_s16(p01_16);
            int16x4_t p2_src = vget_low_s16(p23_16);
            int16x4_t p3_src = vget_high_s16(p23_16);

            int32x4_t p0 = vmull_s16(p0_src, coeff0);
            int32x4_t p1 = vmull_s16(p1_src, coeff1);
            int32x4_t p2 = vmull_s16(p2_src, coeff2);
            int32x4_t p3 = vmull_s16(p3_src, coeff3);

            accum += p0;
            accum += p1;
            accum += p2;
            accum += p3;

            // Advance the pointers.
            rowToFilter += 16;
            filterValues += 4;
        }

        // Scalar tail for the last 1-3 coefficients.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData + remainder_offset, filterValues, accum, r);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Saturate 32->16->8 bits and store one RGBA pixel.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
| 496 |
// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The algorithm is almost same as |convolve_horizontally|. Please
// refer to that function for detailed comments.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // One output pixel (all RGBA channels) per iteration, for all four rows.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {

        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One 32-bit accumulator per row.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // vtbl masks broadcasting coefficient i across a 16x4 lane.
        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

        int start = filterOffset * 4;

        // Consume four coefficients per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            uint8x16_t pixels;
            int16x8_t p01_16, p23_16;
            int32x4_t p0, p1, p2, p3;

// Applies the four loaded coefficients to four pixels of one row (|src|),
// accumulating per-channel 32-bit sums into that row's |accum|.
#define ITERATION(src, accum)                                       \
    pixels = vld1q_u8(src);                                         \
    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \
    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
    p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \
    p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \
    p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \
    p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \
    accum += p0;                                                    \
    accum += p1;                                                    \
    accum += p2;                                                    \
    accum += p3

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);

            start += 16;
            filterValues += 4;
        }

        // Scalar tail for the last 1-3 coefficients of each row.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

// Fixed-point shift, then saturate 32->16->8 bits for one row's result.
#define PACK_RESULT(accum, res)                                        \
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);     \
    accum16 = vqmovn_s32(accum);                                       \
    res = vqmovun_s16(vcombine_s16(accum16, accum16));

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);

        // Store one 32-bit RGBA pixel per row and advance.
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
| 594 |
| 595 |
| 596 // Does vertical convolution to produce one output row. The filter values an
d |
| 597 // length are given in the first two parameters. These are applied to each |
| 598 // of the rows pointed to in the |sourceDataRows| array, with each row |
| 599 // being |pixelWidth| wide. |
| 600 // |
| 601 // The output must have room for |pixelWidth * 4| bytes. |
| 602 template<bool hasAlpha> |
| 603 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte
rValues, |
| 604 int filterLength, |
| 605 unsigned char* const* sourceDataRows, |
| 606 int pixelWidth, |
| 607 unsigned char* outRow) { |
| 608 int width = pixelWidth & ~3; |
| 609 |
| 610 // Output four pixels per iteration (16 bytes). |
| 611 for (int outX = 0; outX < width; outX += 4) { |
| 612 |
| 613 // Accumulated result for each pixel. 32 bits per RGBA channel. |
| 614 int32x4_t accum0 = vdupq_n_s32(0); |
| 615 int32x4_t accum1 = vdupq_n_s32(0); |
| 616 int32x4_t accum2 = vdupq_n_s32(0); |
| 617 int32x4_t accum3 = vdupq_n_s32(0); |
| 618 |
| 619 // Convolve with one filter coefficient per iteration. |
| 620 for (int filterY = 0; filterY < filterLength; filterY++) { |
| 621 |
| 622 // Duplicate the filter coefficient 4 times. |
| 623 // [16] cj cj cj cj |
| 624 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); |
| 625 |
| 626 // Load four pixels (16 bytes) together. |
| 627 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 628 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); |
| 629 |
| 630 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(
src8))); |
| 631 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8
(src8))); |
| 632 int16x4_t src16_0 = vget_low_s16(src16_01); |
| 633 int16x4_t src16_1 = vget_high_s16(src16_01); |
| 634 int16x4_t src16_2 = vget_low_s16(src16_23); |
| 635 int16x4_t src16_3 = vget_high_s16(src16_23); |
| 636 |
| 637 accum0 += vmull_s16(src16_0, coeff16); |
| 638 accum1 += vmull_s16(src16_1, coeff16); |
| 639 accum2 += vmull_s16(src16_2, coeff16); |
| 640 accum3 += vmull_s16(src16_3, coeff16); |
| 641 } |
| 642 |
| 643 // Shift right for fixed point implementation. |
| 644 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 645 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 646 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 647 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 648 |
| 649 // Packing 32 bits |accum| to 16 bits per channel (signed saturation
). |
| 650 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 651 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac
cum1)); |
| 652 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 653 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac
cum3)); |
| 654 |
| 655 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio
n). |
| 656 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 657 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(
accum16_1)); |
| 658 |
| 659 if (hasAlpha) { |
| 660 // Compute the max(ri, gi, bi) for each pixel. |
| 661 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 662 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3
2_u8(accum8), 8)); |
| 663 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 664 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g |
| 665 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 666 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 16)); |
| 667 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 668 b = vmaxq_u8(a, b); // Max of r and g and b. |
| 669 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 670 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24
)); |
| 671 |
| 672 // Make sure the value of alpha channel is always larger than ma
ximum |
| 673 // value of color channels. |
| 674 accum8 = vmaxq_u8(b, accum8); |
| 675 } else { |
| 676 // Set value of alpha channels to 0xFF. |
| 677 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu
pq_n_u32(0xFF000000)); |
| 678 } |
| 679 |
| 680 // Store the convolution result (16 bytes) and advance the pixel poi
nters. |
| 681 vst1q_u8(outRow, accum8); |
| 682 outRow += 16; |
| 683 } |
| 684 |
| 685 // Process the leftovers when the width of the output is not divisible |
| 686 // by 4, that is at most 3 pixels. |
| 687 int r = pixelWidth & 3; |
| 688 if (r) { |
| 689 |
| 690 int32x4_t accum0 = vdupq_n_s32(0); |
| 691 int32x4_t accum1 = vdupq_n_s32(0); |
| 692 int32x4_t accum2 = vdupq_n_s32(0); |
| 693 |
| 694 for (int filterY = 0; filterY < filterLength; ++filterY) { |
| 695 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); |
| 696 |
| 697 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 698 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2])
; |
| 699 |
| 700 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(
src8))); |
| 701 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8
(src8))); |
| 702 int16x4_t src16_0 = vget_low_s16(src16_01); |
| 703 int16x4_t src16_1 = vget_high_s16(src16_01); |
| 704 int16x4_t src16_2 = vget_low_s16(src16_23); |
| 705 |
| 706 accum0 += vmull_s16(src16_0, coeff16); |
| 707 accum1 += vmull_s16(src16_1, coeff16); |
| 708 accum2 += vmull_s16(src16_2, coeff16); |
| 709 } |
| 710 |
| 711 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 712 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 713 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 714 |
| 715 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac
cum1)); |
| 716 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac
cum2)); |
| 717 |
| 718 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(
accum16_1)); |
| 719 |
| 720 if (hasAlpha) { |
| 721 // Compute the max(ri, gi, bi) for each pixel. |
| 722 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 723 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3
2_u8(accum8), 8)); |
| 724 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 725 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g |
| 726 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 727 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 16)); |
| 728 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 729 b = vmaxq_u8(a, b); // Max of r and g and b. |
| 730 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 731 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24
)); |
| 732 |
| 733 // Make sure the value of alpha channel is always larger than ma
ximum |
| 734 // value of color channels. |
| 735 accum8 = vmaxq_u8(b, accum8); |
| 736 } else { |
| 737 // Set value of alpha channels to 0xFF. |
| 738 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu
pq_n_u32(0xFF000000)); |
| 739 } |
| 740 |
| 741 switch(r) { |
| 742 case 1: |
| 743 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret
q_u32_u8(accum8), 0); |
| 744 break; |
| 745 case 2: |
| 746 vst1_u32(reinterpret_cast<uint32_t*>(outRow), |
| 747 vreinterpret_u32_u8(vget_low_u8(accum8))); |
| 748 break; |
| 749 case 3: |
| 750 vst1_u32(reinterpret_cast<uint32_t*>(outRow), |
| 751 vreinterpret_u32_u8(vget_low_u8(accum8))); |
| 752 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpr
etq_u32_u8(accum8), 2); |
| 753 break; |
| 754 } |
| 755 } |
| 756 } |
| 757 |
| 758 #else |
| 759 |
// Clamps the argument into [0, 255] and returns it as an 8-bit unsigned
// value. Used to bring fixed-point convolution accumulators back into the
// representable range of a color channel.
inline unsigned char ClampTo8(int a) {
    // One unsigned comparison covers the common in-range case: any negative
    // value wraps to a huge unsigned number and fails the test.
    if (static_cast<unsigned>(a) < 256) {
        return a;
    }
    // Out of range: negative values pin to 0, everything else to 255.
    return a < 0 ? 0 : 255;
}
| 771 |
| 772 // Convolves horizontally along a single row. The row data is given in |
| 773 // |srcData| and continues for the numValues() of the filter. |
| 774 template<bool hasAlpha> |
| 775 void ConvolveHorizontally(const unsigned char* srcData, |
| 776 const SkConvolutionFilter1D& filter, |
| 777 unsigned char* outRow) { |
| 778 // Loop over each pixel on this row in the output image. |
| 779 int numValues = filter.numValues(); |
| 780 for (int outX = 0; outX < numValues; outX++) { |
| 781 // Get the filter that determines the current output pixel. |
| 782 int filterOffset, filterLength; |
| 783 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 784 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 785 |
| 786 // Compute the first pixel in this row that the filter affects. It w
ill |
| 787 // touch |filterLength| pixels (4 bytes each) after this. |
| 788 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; |
| 789 |
| 790 // Apply the filter to the row to get the destination pixel in |accu
m|. |
| 791 int accum[4] = {0}; |
| 792 for (int filterX = 0; filterX < filterLength; filterX++) { |
| 793 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues
[filterX]; |
| 794 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; |
| 795 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; |
| 796 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; |
| 797 if (hasAlpha) { |
| 798 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; |
| 799 } |
| 800 } |
| 801 |
| 802 // Bring this value back in range. All of the filter scaling factors |
| 803 // are in fixed point with kShiftBits bits of fractional part. |
| 804 accum[0] >>= SkConvolutionFilter1D::kShiftBits; |
| 805 accum[1] >>= SkConvolutionFilter1D::kShiftBits; |
| 806 accum[2] >>= SkConvolutionFilter1D::kShiftBits; |
| 807 if (hasAlpha) { |
| 808 accum[3] >>= SkConvolutionFilter1D::kShiftBits; |
| 809 } |
| 810 |
| 811 // Store the new pixel. |
| 812 outRow[outX * 4 + 0] = ClampTo8(accum[0]); |
| 813 outRow[outX * 4 + 1] = ClampTo8(accum[1]); |
| 814 outRow[outX * 4 + 2] = ClampTo8(accum[2]); |
| 815 if (hasAlpha) { |
| 816 outRow[outX * 4 + 3] = ClampTo8(accum[3]); |
| 817 } |
| 818 } |
| 819 } |
| 820 |
| 821 // Does vertical convolution to produce one output row. The filter values an
d |
| 822 // length are given in the first two parameters. These are applied to each |
| 823 // of the rows pointed to in the |sourceDataRows| array, with each row |
| 824 // being |pixelWidth| wide. |
| 825 // |
| 826 // The output must have room for |pixelWidth * 4| bytes. |
| 827 template<bool hasAlpha> |
| 828 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte
rValues, |
| 829 int filterLength, |
| 830 unsigned char* const* sourceDataRows, |
| 831 int pixelWidth, |
| 832 unsigned char* outRow) { |
| 833 // We go through each column in the output and do a vertical convolution
, |
| 834 // generating one output pixel each time. |
| 835 for (int outX = 0; outX < pixelWidth; outX++) { |
| 836 // Compute the number of bytes over in each row that the current col
umn |
| 837 // we're convolving starts at. The pixel will cover the next 4 bytes
. |
| 838 int byteOffset = outX * 4; |
| 839 |
| 840 // Apply the filter to one column of pixels. |
| 841 int accum[4] = {0}; |
| 842 for (int filterY = 0; filterY < filterLength; filterY++) { |
| 843 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues
[filterY]; |
| 844 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; |
| 845 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; |
| 846 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; |
| 847 if (hasAlpha) { |
| 848 accum[3] += curFilter * sourceDataRows[filterY][byteOffset +
3]; |
| 849 } |
| 850 } |
| 851 |
| 852 // Bring this value back in range. All of the filter scaling factors |
| 853 // are in fixed point with kShiftBits bits of precision. |
| 854 accum[0] >>= SkConvolutionFilter1D::kShiftBits; |
| 855 accum[1] >>= SkConvolutionFilter1D::kShiftBits; |
| 856 accum[2] >>= SkConvolutionFilter1D::kShiftBits; |
| 857 if (hasAlpha) { |
| 858 accum[3] >>= SkConvolutionFilter1D::kShiftBits; |
| 859 } |
| 860 |
| 861 // Store the new pixel. |
| 862 outRow[byteOffset + 0] = ClampTo8(accum[0]); |
| 863 outRow[byteOffset + 1] = ClampTo8(accum[1]); |
| 864 outRow[byteOffset + 2] = ClampTo8(accum[2]); |
| 865 if (hasAlpha) { |
| 866 unsigned char alpha = ClampTo8(accum[3]); |
| 867 |
| 868 // Make sure the alpha channel doesn't come out smaller than any
of the |
| 869 // color channels. We use premultipled alpha channels, so this s
hould |
| 870 // never happen, but rounding errors will cause this from time t
o time. |
| 871 // These "impossible" colors will cause overflows (and hence ran
dom pixel |
| 872 // values) when the resulting bitmap is drawn to the screen. |
| 873 // |
| 874 // We only need to do this when generating the final output row
(here). |
| 875 int maxColorChannel = SkTMax(outRow[byteOffset + 0], |
| 876 SkTMax(outRow[byteOffset + 1], |
| 877 outRow[byteOffset + 2])); |
| 878 if (alpha < maxColorChannel) { |
| 879 outRow[byteOffset + 3] = maxColorChannel; |
| 880 } else { |
| 881 outRow[byteOffset + 3] = alpha; |
| 882 } |
| 883 } else { |
| 884 // No alpha channel, the image is opaque. |
| 885 outRow[byteOffset + 3] = 0xff; |
| 886 } |
| 887 } |
| 888 } |
| 889 |
| 890 // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize
). We originally |
| 891 // thought this was 32 bit only, but subsequent tests show that some 64 bit
gcc compiles |
| 892 // suffer here too. |
| 893 // |
| 894 // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. http
s://bug.skia.org/2575 |
| 895 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) |
| 896 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), no
inline)) |
| 897 #else |
| 898 #define SK_MAYBE_DISABLE_VECTORIZATION |
| 899 #endif |
| 900 |
| 901 SK_MAYBE_DISABLE_VECTORIZATION |
| 902 void convolve_horizontally(const unsigned char* srcData, |
| 903 const SkConvolutionFilter1D& filter, |
| 904 unsigned char* outRow, |
| 905 bool hasAlpha) { |
| 906 if (hasAlpha) { |
| 907 ConvolveHorizontally<true>(srcData, filter, outRow); |
| 908 } else { |
| 909 ConvolveHorizontally<false>(srcData, filter, outRow); |
| 910 } |
| 911 } |
| 912 #undef SK_MAYBE_DISABLE_VECTORIZATION |
| 913 |
// Hook for a SIMD routine that convolves four rows at once; this portable
// build provides no such routine, so the pointer is left null.
// NOTE(review): presumably callers test this for nullptr and fall back to
// convolve_horizontally one row at a time — confirm at the call sites.
void (*convolve_4_rows_horizontally)(const unsigned char* srcData[4],
                                     const SkConvolutionFilter1D& filter,
                                     unsigned char* outRow[4],
                                     size_t outRowBytes)
    = nullptr;
| 919 |
| 920 |
| 921 #endif |
| 922 |
| 923 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt
erValues, |
| 924 int filterLength, |
| 925 unsigned char* const* sourceDataRows, |
| 926 int pixelWidth, |
| 927 unsigned char* outRow, |
| 928 bool hasAlpha) { |
| 929 if (hasAlpha) { |
| 930 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows, |
| 931 pixelWidth, outRow); |
| 932 } else { |
| 933 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows
, |
| 934 pixelWidth, outRow); |
| 935 } |
| 936 } |
| 937 |
| 938 } // namespace SK_OPTS_NS |
| 939 |
| 940 #endif//SkBitmapFilter_opts_DEFINED |
OLD | NEW |