OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * Copyright 2016 Google Inc. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
8 #ifndef SkBitmapFilter_opts_DEFINED | |
9 #define SkBitmapFilter_opts_DEFINED | |
10 | |
11 #include "SkConvolver.h" | |
12 | |
13 namespace SK_OPTS_NS { | |
14 | |
15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
16 | |
17 #include <emmintrin.h> | |
mtklein_C
2016/11/16 14:24:47
This will #include the SSE intrinsics into the SK_OPTS_NS namespace. It's probably best to put the #include at the top of the file, outside the namespace.
xiangze.zhang
2016/11/17 02:33:07
Done.
| |
18 | |
19 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, | |
20 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) { | |
21 int remainder[4] = {0}; | |
22 for (int i = 0; i < r; i++) { | |
23 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; | |
24 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; | |
25 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; | |
26 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; | |
27 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; | |
28 } | |
29 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem ainder[3]); | |
30 accum = _mm_add_epi32(accum, t); | |
31 } | |
32 | |
33 // Convolves horizontally along a single row. The row data is given in | |
34 // |srcData| and continues for the numValues() of the filter. | |
35 void convolve_horizontally(const unsigned char* srcData, | |
36 const SkConvolutionFilter1D& filter, | |
37 unsigned char* outRow, | |
38 bool /*hasAlpha*/) { | |
39 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
40 int numValues = filter.numValues(); | |
41 for (int outX = 0; outX < numValues; outX++) { | |
42 // Get the filter that determines the current output pixel. | |
43 int filterOffset, filterLength; | |
44 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
45 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
46 | |
47 // Compute the first pixel in this row that the filter affects. It w ill | |
48 // touch |filterLength| pixels (4 bytes each) after this. | |
49 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
50 | |
51 __m128i zero = _mm_setzero_si128(); | |
52 __m128i accum = _mm_setzero_si128(); | |
53 | |
54 // We will load and accumulate with four coefficients per iteration. | |
55 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { | |
56 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | |
57 __m128i coeff, coeff16; | |
58 // [16] xx xx xx xx c3 c2 c1 c0 | |
59 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues)); | |
60 // [16] xx xx xx xx c1 c1 c0 c0 | |
61 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
62 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
63 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
64 | |
65 // Load four pixels => unpack the first two pixels to 16 bits => | |
66 // multiply with coefficients => accumulate the convolution resu lt. | |
67 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
68 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>( rowToFilter)); | |
69 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
70 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
71 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
72 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
73 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | |
74 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
75 accum = _mm_add_epi32(accum, t); | |
76 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | |
77 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
78 accum = _mm_add_epi32(accum, t); | |
79 | |
80 // Duplicate 3rd and 4th coefficients for all channels => | |
81 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe fficients | |
82 // => accumulate the convolution results. | |
83 // [16] xx xx xx xx c3 c3 c2 c2 | |
84 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
85 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
86 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
87 // [16] a3 g3 b3 r3 a2 g2 b2 r2 | |
88 src16 = _mm_unpackhi_epi8(src8, zero); | |
89 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
90 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
91 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | |
92 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
93 accum = _mm_add_epi32(accum, t); | |
94 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | |
95 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
96 accum = _mm_add_epi32(accum, t); | |
97 | |
98 // Advance the pixel and coefficients pointers. | |
99 rowToFilter += 16; | |
100 filterValues += 4; | |
101 } | |
102 | |
103 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3 | |
104 // coefficients one at a time. | |
105 int r = filterLength & 3; | |
106 if (r) { | |
107 int remainderOffset = (filterOffset + filterLength - r) * 4; | |
108 AccumRemainder(srcData + remainderOffset, filterValues, accum, r ); | |
109 } | |
110 | |
111 // Shift right for fixed point implementation. | |
112 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | |
113 | |
114 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
115 accum = _mm_packs_epi32(accum, zero); | |
116 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
117 accum = _mm_packus_epi16(accum, zero); | |
118 | |
119 // Store the pixel value of 32 bits. | |
120 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); | |
121 outRow += 4; | |
122 } | |
123 } | |
124 | |
125 // Convolves horizontally along four rows. The row data is given in | |
126 // |srcData| and continues for the numValues() of the filter. | |
127 // The algorithm is almost same as |convolve_horizontally|. Please | |
128 // refer to that function for detailed comments. | |
129 void convolve_4_rows_horizontally(const unsigned char* srcData[4], | |
130 const SkConvolutionFilter1D& filter, | |
131 unsigned char* outRow[4], | |
132 size_t outRowBytes) { | |
133 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];) | |
134 | |
135 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
136 int numValues = filter.numValues(); | |
137 for (int outX = 0; outX < numValues; outX++) { | |
138 int filterOffset, filterLength; | |
139 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
140 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
141 | |
142 __m128i zero = _mm_setzero_si128(); | |
143 | |
144 // four pixels in a column per iteration. | |
145 __m128i accum0 = _mm_setzero_si128(); | |
146 __m128i accum1 = _mm_setzero_si128(); | |
147 __m128i accum2 = _mm_setzero_si128(); | |
148 __m128i accum3 = _mm_setzero_si128(); | |
149 | |
150 int start = filterOffset * 4; | |
151 // We will load and accumulate with four coefficients per iteration. | |
152 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { | |
153 __m128i coeff, coeff16lo, coeff16hi; | |
154 // [16] xx xx xx xx c3 c2 c1 c0 | |
155 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues)); | |
156 // [16] xx xx xx xx c1 c1 c0 c0 | |
157 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
158 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
159 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
160 // [16] xx xx xx xx c3 c3 c2 c2 | |
161 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
162 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
163 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
164 | |
165 __m128i src8, src16, mul_hi, mul_lo, t; | |
166 | |
167 #define ITERATION(src, accum) \ | |
168 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | |
169 src16 = _mm_unpacklo_epi8(src8, zero); \ | |
170 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | |
171 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | |
172 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
173 accum = _mm_add_epi32(accum, t); \ | |
174 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
175 accum = _mm_add_epi32(accum, t); \ | |
176 src16 = _mm_unpackhi_epi8(src8, zero); \ | |
177 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | |
178 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | |
179 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
180 accum = _mm_add_epi32(accum, t); \ | |
181 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
182 accum = _mm_add_epi32(accum, t) | |
183 | |
184 ITERATION(srcData[0] + start, accum0); | |
185 ITERATION(srcData[1] + start, accum1); | |
186 ITERATION(srcData[2] + start, accum2); | |
187 ITERATION(srcData[3] + start, accum3); | |
188 | |
189 start += 16; | |
190 filterValues += 4; | |
191 } | |
192 | |
193 int r = filterLength & 3; | |
194 if (r) { | |
195 int remainderOffset = (filterOffset + filterLength - r) * 4; | |
196 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum 0, r); | |
197 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum 1, r); | |
198 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum 2, r); | |
199 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum 3, r); | |
200 } | |
201 | |
202 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
203 accum0 = _mm_packs_epi32(accum0, zero); | |
204 accum0 = _mm_packus_epi16(accum0, zero); | |
205 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
206 accum1 = _mm_packs_epi32(accum1, zero); | |
207 accum1 = _mm_packus_epi16(accum1, zero); | |
208 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
209 accum2 = _mm_packs_epi32(accum2, zero); | |
210 accum2 = _mm_packus_epi16(accum2, zero); | |
211 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
212 accum3 = _mm_packs_epi32(accum3, zero); | |
213 accum3 = _mm_packus_epi16(accum3, zero); | |
214 | |
215 // We seem to be running off the edge here (chromium:491660). | |
216 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes ); | |
217 | |
218 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0); | |
219 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1); | |
220 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2); | |
221 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3); | |
222 | |
223 outRow[0] += 4; | |
224 outRow[1] += 4; | |
225 outRow[2] += 4; | |
226 outRow[3] += 4; | |
227 } | |
228 } | |
229 | |
230 // Does vertical convolution to produce one output row. The filter values an d | |
231 // length are given in the first two parameters. These are applied to each | |
232 // of the rows pointed to in the |sourceDataRows| array, with each row | |
233 // being |pixelWidth| wide. | |
234 // | |
235 // The output must have room for |pixelWidth * 4| bytes. | |
236 template<bool hasAlpha> | |
237 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
238 int filterLength, | |
239 unsigned char* const* sourceDataRows, | |
240 int pixelWidth, | |
241 unsigned char* outRow) { | |
242 // Output four pixels per iteration (16 bytes). | |
243 int width = pixelWidth & ~3; | |
244 __m128i zero = _mm_setzero_si128(); | |
245 for (int outX = 0; outX < width; outX += 4) { | |
246 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
247 __m128i accum0 = _mm_setzero_si128(); | |
248 __m128i accum1 = _mm_setzero_si128(); | |
249 __m128i accum2 = _mm_setzero_si128(); | |
250 __m128i accum3 = _mm_setzero_si128(); | |
251 | |
252 // Convolve with one filter coefficient per iteration. | |
253 for (int filterY = 0; filterY < filterLength; filterY++) { | |
254 | |
255 // Duplicate the filter coefficient 8 times. | |
256 // [16] cj cj cj cj cj cj cj cj | |
257 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); | |
258 | |
259 // Load four pixels (16 bytes) together. | |
260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
261 const __m128i* src = reinterpret_cast<const __m128i*>( | |
262 &sourceDataRows[filterY][outX << 2]); | |
263 __m128i src8 = _mm_loadu_si128(src); | |
264 | |
265 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha nnels => | |
266 // multiply with current coefficient => accumulate the result. | |
267 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
268 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
269 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
270 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
271 // [32] a0 b0 g0 r0 | |
272 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
273 accum0 = _mm_add_epi32(accum0, t); | |
274 // [32] a1 b1 g1 r1 | |
275 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
276 accum1 = _mm_add_epi32(accum1, t); | |
277 | |
278 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha nnels => | |
279 // multiply with current coefficient => accumulate the result. | |
280 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
281 src16 = _mm_unpackhi_epi8(src8, zero); | |
282 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
283 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
284 // [32] a2 b2 g2 r2 | |
285 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
286 accum2 = _mm_add_epi32(accum2, t); | |
287 // [32] a3 b3 g3 r3 | |
288 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
289 accum3 = _mm_add_epi32(accum3, t); | |
290 } | |
291 | |
292 // Shift right for fixed point implementation. | |
293 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
294 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
295 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
296 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
297 | |
298 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
299 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
300 accum0 = _mm_packs_epi32(accum0, accum1); | |
301 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
302 accum2 = _mm_packs_epi32(accum2, accum3); | |
303 | |
304 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
305 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
306 accum0 = _mm_packus_epi16(accum0, accum2); | |
307 | |
308 if (hasAlpha) { | |
309 // Compute the max(ri, gi, bi) for each pixel. | |
310 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
311 __m128i a = _mm_srli_epi32(accum0, 8); | |
312 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
313 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
314 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
315 a = _mm_srli_epi32(accum0, 16); | |
316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
317 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
318 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
319 b = _mm_slli_epi32(b, 24); | |
320 | |
321 // Make sure the value of alpha channel is always larger than ma ximum | |
322 // value of color channels. | |
323 accum0 = _mm_max_epu8(b, accum0); | |
324 } else { | |
325 // Set value of alpha channels to 0xFF. | |
326 __m128i mask = _mm_set1_epi32(0xff000000); | |
327 accum0 = _mm_or_si128(accum0, mask); | |
328 } | |
329 | |
330 // Store the convolution result (16 bytes) and advance the pixel poi nters. | |
331 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); | |
332 outRow += 16; | |
333 } | |
334 | |
335 // When the width of the output is not divisible by 4, We need to save o ne | |
336 // pixel (4 bytes) each time. And also the fourth pixel is always absent . | |
337 int r = pixelWidth & 3; | |
338 if (r) { | |
339 __m128i accum0 = _mm_setzero_si128(); | |
340 __m128i accum1 = _mm_setzero_si128(); | |
341 __m128i accum2 = _mm_setzero_si128(); | |
342 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
343 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); | |
344 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
345 const __m128i* src = reinterpret_cast<const __m128i*>( | |
346 &sourceDataRows[filterY][width << 2]); | |
347 __m128i src8 = _mm_loadu_si128(src); | |
348 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
349 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
350 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
351 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
352 // [32] a0 b0 g0 r0 | |
353 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
354 accum0 = _mm_add_epi32(accum0, t); | |
355 // [32] a1 b1 g1 r1 | |
356 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
357 accum1 = _mm_add_epi32(accum1, t); | |
358 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
359 src16 = _mm_unpackhi_epi8(src8, zero); | |
360 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
361 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
362 // [32] a2 b2 g2 r2 | |
363 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
364 accum2 = _mm_add_epi32(accum2, t); | |
365 } | |
366 | |
367 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
368 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
369 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
370 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
371 accum0 = _mm_packs_epi32(accum0, accum1); | |
372 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
373 accum2 = _mm_packs_epi32(accum2, zero); | |
374 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
375 accum0 = _mm_packus_epi16(accum0, accum2); | |
376 if (hasAlpha) { | |
377 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
378 __m128i a = _mm_srli_epi32(accum0, 8); | |
379 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
380 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
381 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
382 a = _mm_srli_epi32(accum0, 16); | |
383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
384 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
385 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
386 b = _mm_slli_epi32(b, 24); | |
387 accum0 = _mm_max_epu8(b, accum0); | |
388 } else { | |
389 __m128i mask = _mm_set1_epi32(0xff000000); | |
390 accum0 = _mm_or_si128(accum0, mask); | |
391 } | |
392 | |
393 for (int i = 0; i < r; i++) { | |
394 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); | |
395 accum0 = _mm_srli_si128(accum0, 4); | |
396 outRow += 4; | |
397 } | |
398 } | |
399 } | |
400 | |
401 #elif defined(SK_ARM_HAS_NEON) | |
402 | |
403 #include <arm_neon.h> | |
mtklein_C
2016/11/16 14:24:47
Same deal with emmintrin.h: it's probably best to put this #include at the top of the file, outside the SK_OPTS_NS namespace.
xiangze.zhang
2016/11/17 02:33:07
Done.
| |
404 | |
// Accumulates the final 0-3 filter taps (the tail that does not fill a
// whole NEON register) into |accum| using plain scalar math.
// |pixelsLeft| points at the first remaining source pixel; |r| is the
// number of remaining taps.
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int32_t sums[4] = {0, 0, 0, 0};
    for (int tap = 0; tap < r; ++tap) {
        const SkConvolutionFilter1D::ConvolutionFixed weight = filterValues[tap];
        const unsigned char* px = pixelsLeft + tap * 4;
        sums[0] += weight * px[0];
        sums[1] += weight * px[1];
        sums[2] += weight * px[2];
        sums[3] += weight * px[3];
    }
    accum = vaddq_s32(accum, vld1q_s32(sums));
}
418 | |
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // vtbl1_u8 masks that splat coefficient j of a loaded int16x4_t
    // across all four 16-bit lanes of the destination vector.
    const uint8x8_t coeffMask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeffMask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeffMask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeffMask3 = vcreate_u8(0x0706070607060706);

    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Fetch the filter taps that produce this output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // First source pixel this filter touches; it covers |filterLength|
        // pixels (4 bytes each) from here on.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Accumulate the destination pixel in 32 bits per channel.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
            // Load 4 coefficients and broadcast each across its own vector.
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask3));

            // Load 4 pixels and widen each to 16 bits per channel.
            uint8x16_t pixels = vld1q_u8(rowToFilter);
            int16x8_t p01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            int16x8_t p23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // Multiply each pixel by its coefficient and accumulate.
            accum = vaddq_s32(accum, vmull_s16(vget_low_s16(p01), coeff0));
            accum = vaddq_s32(accum, vmull_s16(vget_high_s16(p01), coeff1));
            accum = vaddq_s32(accum, vmull_s16(vget_low_s16(p23), coeff2));
            accum = vaddq_s32(accum, vmull_s16(vget_high_s16(p23), coeff3));

            // Advance the pixel and coefficient pointers.
            rowToFilter += 16;
            filterValues += 4;
        }

        // Handle the last 1-3 taps when |filterLength| % 4 != 0.
        int r = filterLength & 3;
        if (r) {
            int remainderOffset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Narrow with saturation and store the 4-byte pixel.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
494 | |
// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The per-register math mirrors |convolve_horizontally|; see that
// function for detailed commentary.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // vtbl1_u8 masks that splat coefficient j across a whole vector.
    const uint8x8_t coeffMask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeffMask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeffMask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeffMask3 = vcreate_u8(0x0706070607060706);

    // Emit one output pixel (per row) each iteration, all RGBA channels at once.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One accumulator per source row.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        int start = filterOffset * 4;

        // Main loop: consume four coefficients (and four pixels per row) at a time.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask3));

            uint8x16_t pixels;
            int16x8_t p01_16, p23_16;
            int32x4_t p0, p1, p2, p3;

            // Convolves 4 pixels of one row against coeff0..coeff3 into |accum|.
#define ITERATION(src, accum)                                      \
    pixels = vld1q_u8(src);                                        \
    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \
    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));\
    p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                  \
    p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                 \
    p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                  \
    p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                 \
    accum += p0;                                                   \
    accum += p1;                                                   \
    accum += p2;                                                   \
    accum += p3

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);
#undef ITERATION

            start += 16;
            filterValues += 4;
        }

        // Handle the last 1-3 taps when |filterLength| % 4 != 0.
        int r = filterLength & 3;
        if (r) {
            int remainderOffset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainderOffset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainderOffset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainderOffset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainderOffset, filterValues, accum3, r);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

        // Undo fixed-point scaling, then narrow 32 -> 16 -> 8 bits with saturation.
#define PACK_RESULT(accum, res)                                    \
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \
    accum16 = vqmovn_s32(accum);                                   \
    res = vqmovun_s16(vcombine_s16(accum16, accum16));

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);
#undef PACK_RESULT

        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
592 | |
593 | |
594 // Does vertical convolution to produce one output row. The filter values an d | |
595 // length are given in the first two parameters. These are applied to each | |
596 // of the rows pointed to in the |sourceDataRows| array, with each row | |
597 // being |pixelWidth| wide. | |
598 // | |
599 // The output must have room for |pixelWidth * 4| bytes. | |
600 template<bool hasAlpha> | |
601 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
602 int filterLength, | |
603 unsigned char* const* sourceDataRows, | |
604 int pixelWidth, | |
605 unsigned char* outRow) { | |
606 int width = pixelWidth & ~3; | |
607 | |
608 // Output four pixels per iteration (16 bytes). | |
609 for (int outX = 0; outX < width; outX += 4) { | |
610 | |
611 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
612 int32x4_t accum0 = vdupq_n_s32(0); | |
613 int32x4_t accum1 = vdupq_n_s32(0); | |
614 int32x4_t accum2 = vdupq_n_s32(0); | |
615 int32x4_t accum3 = vdupq_n_s32(0); | |
616 | |
617 // Convolve with one filter coefficient per iteration. | |
618 for (int filterY = 0; filterY < filterLength; filterY++) { | |
619 | |
620 // Duplicate the filter coefficient 4 times. | |
621 // [16] cj cj cj cj | |
622 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); | |
623 | |
624 // Load four pixels (16 bytes) together. | |
625 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
626 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); | |
627 | |
628 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8( src8))); | |
629 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8 (src8))); | |
630 int16x4_t src16_0 = vget_low_s16(src16_01); | |
631 int16x4_t src16_1 = vget_high_s16(src16_01); | |
632 int16x4_t src16_2 = vget_low_s16(src16_23); | |
633 int16x4_t src16_3 = vget_high_s16(src16_23); | |
634 | |
635 accum0 += vmull_s16(src16_0, coeff16); | |
636 accum1 += vmull_s16(src16_1, coeff16); | |
637 accum2 += vmull_s16(src16_2, coeff16); | |
638 accum3 += vmull_s16(src16_3, coeff16); | |
639 } | |
640 | |
641 // Shift right for fixed point implementation. | |
642 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
643 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
644 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
645 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); | |
646 | |
647 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
648 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
649 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac cum1)); | |
650 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
651 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac cum3)); | |
652 | |
653 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
654 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
655 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16( accum16_1)); | |
656 | |
657 if (hasAlpha) { | |
658 // Compute the max(ri, gi, bi) for each pixel. | |
659 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
660 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3 2_u8(accum8), 8)); | |
661 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
662 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
663 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
664 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8 ), 16)); | |
665 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
666 b = vmaxq_u8(a, b); // Max of r and g and b. | |
667 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
668 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24 )); | |
669 | |
670 // Make sure the value of alpha channel is always larger than ma ximum | |
671 // value of color channels. | |
672 accum8 = vmaxq_u8(b, accum8); | |
673 } else { | |
674 // Set value of alpha channels to 0xFF. | |
675 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu pq_n_u32(0xFF000000)); | |
676 } | |
677 | |
678 // Store the convolution result (16 bytes) and advance the pixel poi nters. | |
679 vst1q_u8(outRow, accum8); | |
680 outRow += 16; | |
681 } | |
682 | |
683 // Process the leftovers when the width of the output is not divisible | |
684 // by 4, that is at most 3 pixels. | |
685 int r = pixelWidth & 3; | |
686 if (r) { | |
687 | |
688 int32x4_t accum0 = vdupq_n_s32(0); | |
689 int32x4_t accum1 = vdupq_n_s32(0); | |
690 int32x4_t accum2 = vdupq_n_s32(0); | |
691 | |
692 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
693 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); | |
694 | |
695 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
696 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]) ; | |
697 | |
698 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8( src8))); | |
699 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8 (src8))); | |
700 int16x4_t src16_0 = vget_low_s16(src16_01); | |
701 int16x4_t src16_1 = vget_high_s16(src16_01); | |
702 int16x4_t src16_2 = vget_low_s16(src16_23); | |
703 | |
704 accum0 += vmull_s16(src16_0, coeff16); | |
705 accum1 += vmull_s16(src16_1, coeff16); | |
706 accum2 += vmull_s16(src16_2, coeff16); | |
707 } | |
708 | |
709 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
710 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
711 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
712 | |
713 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac cum1)); | |
714 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac cum2)); | |
715 | |
716 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16( accum16_1)); | |
717 | |
718 if (hasAlpha) { | |
719 // Compute the max(ri, gi, bi) for each pixel. | |
720 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
721 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3 2_u8(accum8), 8)); | |
722 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
723 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
724 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
725 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8 ), 16)); | |
726 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
727 b = vmaxq_u8(a, b); // Max of r and g and b. | |
728 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
729 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24 )); | |
730 | |
731 // Make sure the value of alpha channel is always larger than ma ximum | |
732 // value of color channels. | |
733 accum8 = vmaxq_u8(b, accum8); | |
734 } else { | |
735 // Set value of alpha channels to 0xFF. | |
736 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu pq_n_u32(0xFF000000)); | |
737 } | |
738 | |
739 switch(r) { | |
740 case 1: | |
741 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret q_u32_u8(accum8), 0); | |
742 break; | |
743 case 2: | |
744 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
745 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
746 break; | |
747 case 3: | |
748 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
749 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
750 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpr etq_u32_u8(accum8), 2); | |
751 break; | |
752 } | |
753 } | |
754 } | |
755 | |
756 #else | |
757 | |
// Converts the argument to an 8-bit unsigned value by clamping to the range
// 0-255.
inline unsigned char ClampTo8(int a) {
    // Clamp below first, then above; values already in range fall through.
    if (a < 0) {
        return 0;
    }
    return (a > 255) ? 255 : static_cast<unsigned char>(a);
}
769 | |
770 // Convolves horizontally along a single row. The row data is given in | |
771 // |srcData| and continues for the numValues() of the filter. | |
772 template<bool hasAlpha> | |
773 void ConvolveHorizontally(const unsigned char* srcData, | |
774 const SkConvolutionFilter1D& filter, | |
775 unsigned char* outRow) { | |
776 // Loop over each pixel on this row in the output image. | |
777 int numValues = filter.numValues(); | |
778 for (int outX = 0; outX < numValues; outX++) { | |
779 // Get the filter that determines the current output pixel. | |
780 int filterOffset, filterLength; | |
781 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
782 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
783 | |
784 // Compute the first pixel in this row that the filter affects. It w ill | |
785 // touch |filterLength| pixels (4 bytes each) after this. | |
786 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
787 | |
788 // Apply the filter to the row to get the destination pixel in |accu m|. | |
789 int accum[4] = {0}; | |
790 for (int filterX = 0; filterX < filterLength; filterX++) { | |
791 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterX]; | |
792 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; | |
793 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; | |
794 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; | |
795 if (hasAlpha) { | |
796 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; | |
797 } | |
798 } | |
799 | |
800 // Bring this value back in range. All of the filter scaling factors | |
801 // are in fixed point with kShiftBits bits of fractional part. | |
802 accum[0] >>= SkConvolutionFilter1D::kShiftBits; | |
803 accum[1] >>= SkConvolutionFilter1D::kShiftBits; | |
804 accum[2] >>= SkConvolutionFilter1D::kShiftBits; | |
805 if (hasAlpha) { | |
806 accum[3] >>= SkConvolutionFilter1D::kShiftBits; | |
807 } | |
808 | |
809 // Store the new pixel. | |
810 outRow[outX * 4 + 0] = ClampTo8(accum[0]); | |
811 outRow[outX * 4 + 1] = ClampTo8(accum[1]); | |
812 outRow[outX * 4 + 2] = ClampTo8(accum[2]); | |
813 if (hasAlpha) { | |
814 outRow[outX * 4 + 3] = ClampTo8(accum[3]); | |
815 } | |
816 } | |
817 } | |
818 | |
819 // Does vertical convolution to produce one output row. The filter values an d | |
820 // length are given in the first two parameters. These are applied to each | |
821 // of the rows pointed to in the |sourceDataRows| array, with each row | |
822 // being |pixelWidth| wide. | |
823 // | |
824 // The output must have room for |pixelWidth * 4| bytes. | |
825 template<bool hasAlpha> | |
826 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
827 int filterLength, | |
828 unsigned char* const* sourceDataRows, | |
829 int pixelWidth, | |
830 unsigned char* outRow) { | |
831 // We go through each column in the output and do a vertical convolution , | |
832 // generating one output pixel each time. | |
833 for (int outX = 0; outX < pixelWidth; outX++) { | |
834 // Compute the number of bytes over in each row that the current col umn | |
835 // we're convolving starts at. The pixel will cover the next 4 bytes . | |
836 int byteOffset = outX * 4; | |
837 | |
838 // Apply the filter to one column of pixels. | |
839 int accum[4] = {0}; | |
840 for (int filterY = 0; filterY < filterLength; filterY++) { | |
841 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterY]; | |
842 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; | |
843 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; | |
844 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; | |
845 if (hasAlpha) { | |
846 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; | |
847 } | |
848 } | |
849 | |
850 // Bring this value back in range. All of the filter scaling factors | |
851 // are in fixed point with kShiftBits bits of precision. | |
852 accum[0] >>= SkConvolutionFilter1D::kShiftBits; | |
853 accum[1] >>= SkConvolutionFilter1D::kShiftBits; | |
854 accum[2] >>= SkConvolutionFilter1D::kShiftBits; | |
855 if (hasAlpha) { | |
856 accum[3] >>= SkConvolutionFilter1D::kShiftBits; | |
857 } | |
858 | |
859 // Store the new pixel. | |
860 outRow[byteOffset + 0] = ClampTo8(accum[0]); | |
861 outRow[byteOffset + 1] = ClampTo8(accum[1]); | |
862 outRow[byteOffset + 2] = ClampTo8(accum[2]); | |
863 if (hasAlpha) { | |
864 unsigned char alpha = ClampTo8(accum[3]); | |
865 | |
866 // Make sure the alpha channel doesn't come out smaller than any of the | |
867 // color channels. We use premultipled alpha channels, so this s hould | |
868 // never happen, but rounding errors will cause this from time t o time. | |
869 // These "impossible" colors will cause overflows (and hence ran dom pixel | |
870 // values) when the resulting bitmap is drawn to the screen. | |
871 // | |
872 // We only need to do this when generating the final output row (here). | |
873 int maxColorChannel = SkTMax(outRow[byteOffset + 0], | |
874 SkTMax(outRow[byteOffset + 1], | |
875 outRow[byteOffset + 2])); | |
876 if (alpha < maxColorChannel) { | |
877 outRow[byteOffset + 3] = maxColorChannel; | |
878 } else { | |
879 outRow[byteOffset + 3] = alpha; | |
880 } | |
881 } else { | |
882 // No alpha channel, the image is opaque. | |
883 outRow[byteOffset + 3] = 0xff; | |
884 } | |
885 } | |
886 } | |
887 | |
// There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally
// thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles
// suffer here too.
//
// Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575
893 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) | |
894 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), no inline)) | |
895 #else | |
896 #define SK_MAYBE_DISABLE_VECTORIZATION | |
897 #endif | |
898 | |
899 SK_MAYBE_DISABLE_VECTORIZATION | |
900 void convolve_horizontally(const unsigned char* srcData, | |
901 const SkConvolutionFilter1D& filter, | |
902 unsigned char* outRow, | |
903 bool has_alpha) { | |
904 if (has_alpha) { | |
905 ConvolveHorizontally<true>(srcData, filter, outRow); | |
906 } else { | |
907 ConvolveHorizontally<false>(srcData, filter, outRow); | |
908 } | |
909 } | |
910 #undef SK_MAYBE_DISABLE_VECTORIZATION | |
911 | |
912 void (*convolve_4_rows_horizontally)(const unsigned char* src_data[4], | |
mtklein_C
2016/11/16 14:24:47
Do you think it'd make the calling code clearer to
xiangze.zhang
2016/11/17 02:33:07
The calling code can check and allocate a smaller
| |
913 const SkConvolutionFilter1D& filter, | |
914 unsigned char* out_row[4], | |
915 size_t out_row_bytes) | |
916 = nullptr; | |
917 | |
918 | |
919 #endif | |
920 | |
921 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt er_values, | |
922 int filter_length, | |
923 unsigned char* const* source_data_rows, | |
924 int pixel_width, | |
925 unsigned char* out_row, | |
926 bool has_alpha) { | |
927 if (has_alpha) { | |
928 convolveVertically<true>(filter_values, filter_length, source_data_r ows, | |
929 pixel_width, out_row); | |
930 } else { | |
931 convolveVertically<false>(filter_values, filter_length, source_data_ rows, | |
932 pixel_width, out_row); | |
933 } | |
934 } | |
935 | |
936 } // namespace SK_OPTS_NS | |
937 | |
938 #endif//SkBitmapFilter_opts_DEFINED | |
OLD | NEW |