src/opts/SkBitmapProcState_arm_neon.cpp - Issue 27533004: ARM Skia NEON patches - 33 - Convolution filter

Side by Side Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 27533004: ARM Skia NEON patches - 33 - Convolution filter (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Remove the unused variable Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1	1

2 /*	2 /*

3 * Copyright 2012 Google Inc.	3 * Copyright 2012 Google Inc.

4 *	4 *

5 * Use of this source code is governed by a BSD-style license that can be	5 * Use of this source code is governed by a BSD-style license that can be

6 * found in the LICENSE file.	6 * found in the LICENSE file.

7 */	7 */

8 #include "SkBitmapProcState.h"	8 #include "SkBitmapProcState.h"

9 #include "SkBitmapProcState_filter.h"	9 #include "SkBitmapProcState_filter.h"

10 #include "SkColorPriv.h"	10 #include "SkColorPriv.h"

(...skipping 72 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
83 SI8_D16_nofilter_DXDY_neon,	83 SI8_D16_nofilter_DXDY_neon,

84 SI8_D16_nofilter_DX_neon,	84 SI8_D16_nofilter_DX_neon,

85 SI8_D16_filter_DXDY_neon,	85 SI8_D16_filter_DXDY_neon,

86 SI8_D16_filter_DX_neon,	86 SI8_D16_filter_DX_neon,

87	87

88 // Don't support 4444 -> 565	88 // Don't support 4444 -> 565

89 NULL, NULL, NULL, NULL,	89 NULL, NULL, NULL, NULL,

90 // Don't support A8 -> 565	90 // Don't support A8 -> 565

91 NULL, NULL, NULL, NULL	91 NULL, NULL, NULL, NULL

92 };	92 };

	93

	94 ///////////////////////////////////////////////////////////////////////////////

	95

	96 #include <arm_neon.h>

	97 #include "SkConvolver.h"

	98

	99 // Convolves horizontally along a single row. The row data is given in

	100 // \|srcData\| and continues for the numValues() of the filter.

	101 void convolveHorizontally_neon(const unsigned char* srcData,

	102 const SkConvolutionFilter1D& filter,

	103 unsigned char* outRow,

	104 bool hasAlpha) {

	105 // Loop over each pixel on this row in the output image.

	106 int numValues = filter.numValues();

	107 for (int outX = 0; outX < numValues; outX++) {

	108 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

	109 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);

	110 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);

	111 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

	112 // Get the filter that determines the current output pixel.

	113 int filterOffset, filterLength;

	114 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =

	115 filter.FilterForValue(outX, &filterOffset, &filterLength);

	116

	117 // Compute the first pixel in this row that the filter affects. It will

	118 // touch \|filterLength\| pixels (4 bytes each) after this.

	119 const unsigned char* rowToFilter = &srcData[filterOffset * 4];

	120

	121 // Apply the filter to the row to get the destination pixel in \|accum\|.

	122 int32x4_t accum = vdupq_n_s32(0);

	123 for (int filterX = 0; filterX < filterLength >> 2; filterX++) {

	124 // Load 4 coefficients

	125 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

	126 coeffs = vld1_s16(filterValues);

	127 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));

	128 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));

	129 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));

	130 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));

	131

	132 // Load pixels and calc

	133 uint8x16_t pixels = vld1q_u8(rowToFilter);

	134 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));

	135 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));

	136

	137 int16x4_t p0_src = vget_low_s16(p01_16);

	138 int16x4_t p1_src = vget_high_s16(p01_16);

	139 int16x4_t p2_src = vget_low_s16(p23_16);

	140 int16x4_t p3_src = vget_high_s16(p23_16);

	141

	142 int32x4_t p0 = vmull_s16(p0_src, coeff0);

	143 int32x4_t p1 = vmull_s16(p1_src, coeff1);

	144 int32x4_t p2 = vmull_s16(p2_src, coeff2);

	145 int32x4_t p3 = vmull_s16(p3_src, coeff3);

	146

	147 accum += p0;

	148 accum += p1;

	149 accum += p2;

	150 accum += p3;

	151

	152 // Advance the pointers

	153 rowToFilter += 16;

	154 filterValues += 4;

	155 }

	156 int r = filterLength & 3;

	157 if (r) {

	158 const uint16_t mask[4][4] = {

	159 {0, 0, 0, 0},

	160 {0xFFFF, 0, 0, 0},

	161 {0xFFFF, 0xFFFF, 0, 0},

	162 {0xFFFF, 0xFFFF, 0xFFFF, 0}

	163 };

	164 uint16x4_t coeffs;

	165 int16x4_t coeff0, coeff1, coeff2;

	166 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));

	167 coeffs &= vld1_u16(&mask[r][0]);

	168 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask0));

	169 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask1));

	170 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask2));

	171

	172 // Load pixels and calc

	173 uint8x16_t pixels = vld1q_u8(rowToFilter);

	174 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));

	175 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));

	176 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);

	177 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);

	178 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);

	179

	180 accum += p0;

	181 accum += p1;

	182 accum += p2;

	183 }

	184

	185 // Bring this value back in range. All of the filter scaling factors

	186 // are in fixed point with kShiftBits bits of fractional part.

	187 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

	188

	189 // Pack and store the new pixel.

	190 int16x4_t accum16 = vqmovn_s32(accum);

	191 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));

	192 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);

	193 outRow += 4;

	194 }

	195 }

	196

	197 // Does vertical convolution to produce one output row. The filter values and

	198 // length are given in the first two parameters. These are applied to each

	199 // of the rows pointed to in the \|sourceDataRows\| array, with each row

	200 // being \|pixelWidth\| wide.

	201 //

	202 // The output must have room for \|pixelWidth * 4\| bytes.

	203 template<bool hasAlpha>

	204 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filt erValues,

	205 int filterLength,

	206 unsigned char* const* sourceDataRows,

	207 int pixelWidth,

	208 unsigned char* outRow) {

	209 int width = pixelWidth & ~3;

	210

	211 int32x4_t accum0, accum1, accum2, accum3;

	212 int16x4_t coeff16;

	213

	214 // Output four pixels per iteration (16 bytes).

	215 for (int outX = 0; outX < width; outX += 4) {

	216

	217 // Accumulated result for each pixel. 32 bits per RGBA channel.

	218 accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);

	219

	220 // Convolve with one filter coefficient per iteration.

	221 for (int filterY = 0; filterY < filterLength; filterY++) {

	222

	223 // Duplicate the filter coefficient 4 times.

	224 // [16] cj cj cj cj

	225 coeff16 = vdup_n_s16(filterValues[filterY]);

	226

	227 // Load four pixels (16 bytes) together.

	228 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	229 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);

	230

	231 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8 )));

	232 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src 8)));

	233 int16x4_t src16_0 = vget_low_s16(src16_01);

	234 int16x4_t src16_1 = vget_high_s16(src16_01);

	235 int16x4_t src16_2 = vget_low_s16(src16_23);

	236 int16x4_t src16_3 = vget_high_s16(src16_23);

	237

	238 accum0 += vmull_s16(src16_0, coeff16);

	239 accum1 += vmull_s16(src16_1, coeff16);

	240 accum2 += vmull_s16(src16_2, coeff16);

	241 accum3 += vmull_s16(src16_3, coeff16);

	242 }

	243

	244 // Shift right for fixed point implementation.

	245 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);

	246 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);

	247 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);

	248 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

	249

	250 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

	251 // [16] a1 b1 g1 r1 a0 b0 g0 r0

	252 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1 ));

	253 // [16] a3 b3 g3 r3 a2 b2 g2 r2

	254 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3 ));

	255

	256 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

	257 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	258 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accu m16_1));

	259

	260 if (hasAlpha) {

	261 // Compute the max(ri, gi, bi) for each pixel.

	262 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	263 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8 (accum8), 8));

	264 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	265 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g

	266 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	267 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 1 6));

	268 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	269 b = vmaxq_u8(a, b); // Max of r and g and b.

	270 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	271 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

	272

	273 // Make sure the value of alpha channel is always larger than maximu m

	274 // value of color channels.

	275 accum8 = vmaxq_u8(b, accum8);

	276 } else {

	277 // Set value of alpha channels to 0xFF.

	278 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) \| vdupq_n _u32(0xFF000000));

	279 }

	280

	281 // Store the convolution result (16 bytes) and advance the pixel pointer s.

	282 vst1q_u8(outRow, accum8);

	283 outRow += 16;

	284 }

	285

	286 // Process the leftovers when the width of the output is not divisible

	287 // by 4, that is at most 3 pixels.

	288 int r = pixelWidth & 3;

	289 if (r) {

	290

	291 accum0 = accum1 = accum2 = vdupq_n_s32(0);

	292

	293 for (int filterY = 0; filterY < filterLength; ++filterY) {

	294 coeff16 = vdup_n_s16(filterValues[filterY]);

	295

	296 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	297 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);

	298

	299 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8 )));

	300 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src 8)));

	301 int16x4_t src16_0 = vget_low_s16(src16_01);

	302 int16x4_t src16_1 = vget_high_s16(src16_01);

	303 int16x4_t src16_2 = vget_low_s16(src16_23);

	304

	305 accum0 += vmull_s16(src16_0, coeff16);

	306 accum1 += vmull_s16(src16_1, coeff16);

	307 accum2 += vmull_s16(src16_2, coeff16);

	308 }

	309

	310 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);

	311 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);

	312 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);

	313

	314 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1 ));

	315 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2 ));

	316

	317 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accu m16_1));

	318

	319 if (hasAlpha) {

	320 // Compute the max(ri, gi, bi) for each pixel.

	321 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	322 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8 (accum8), 8));

	323 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	324 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g

	325 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	326 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 1 6));

	327 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	328 b = vmaxq_u8(a, b); // Max of r and g and b.

	329 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	330 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

	331

	332 // Make sure the value of alpha channel is always larger than maximu m

	333 // value of color channels.

	334 accum8 = vmaxq_u8(b, accum8);

	335 } else {

	336 // Set value of alpha channels to 0xFF.

	337 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) \| vdupq_n _u32(0xFF000000));

	338 }

	339

	340 switch(r) {

	341 case 1:

	342 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u3 2_u8(accum8), 0);

	343 break;

	344 case 2:

	345 vst1_u32(reinterpret_cast<uint32_t*>(outRow),

	346 vreinterpret_u32_u8(vget_low_u8(accum8)));

	347 break;

	348 case 3:

	349 vst1_u32(reinterpret_cast<uint32_t*>(outRow),

	350 vreinterpret_u32_u8(vget_low_u8(accum8)));

	351 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_ u32_u8(accum8), 2);

	352 break;

	353 }

	354 }

	355 }

	356

	357 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filt erValues,

	358 int filterLength,

	359 unsigned char* const* sourceDataRows,

	360 int pixelWidth,

	361 unsigned char* outRow,

	362 bool sourceHasAlpha) {

	363 if (sourceHasAlpha) {

	364 convolveVertically_neon<true>(filterValues, filterLength,

	365 sourceDataRows, pixelWidth,

	366 outRow);

	367 } else {

	368 convolveVertically_neon<false>(filterValues, filterLength,

	369 sourceDataRows, pixelWidth,

	370 outRow);

	371 }

	372 }

	373

	374 // Convolves horizontally along four rows. The row data is given in

	375 // \|src_data\| and continues for the num_values() of the filter.

	376 // The algorithm is almost same as \|ConvolveHorizontally_SSE2\|. Please

	377 // refer to that function for detailed comments.

	378 void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],

	379 const SkConvolutionFilter1D& filter,

	380 unsigned char* outRow[4]) {

	381

	382 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

	383 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);

	384 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);

	385 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

	386 int num_values = filter.numValues();

	387

	388 int filterOffset, filterLength;

	389 // \|mask\| will be used to decimate all extra filter coefficients that are

	390 // loaded by SIMD when \|filter_length\| is not divisible by 4.

	391 // mask[0] is not used in following algorithm.

	392 const uint16_t mask[4][4] = {

	393 {0, 0, 0, 0},

	394 {0xFFFF, 0, 0, 0},

	395 {0xFFFF, 0xFFFF, 0, 0},

	396 {0xFFFF, 0xFFFF, 0xFFFF, 0}

	397 };

	398

	399 // Output one pixel each iteration, calculating all channels (RGBA) together .

	400 for (int outX = 0; outX < num_values; outX++) {

	401

	402 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =

	403 filter.FilterForValue(outX, &filterOffset, &filterLength);

	404

	405 // four pixels in a column per iteration.

	406 int32x4_t accum0 = vdupq_n_s32(0);

	407 int32x4_t accum1 = vdupq_n_s32(0);

	408 int32x4_t accum2 = vdupq_n_s32(0);

	409 int32x4_t accum3 = vdupq_n_s32(0);

	410

	411 int start = (filterOffset<<2);

	412

	413 // We will load and accumulate with four coefficients per iteration.

	414 for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {

	415 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

	416

	417 coeffs = vld1_s16(filterValues);

	418 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));

	419 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));

	420 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));

	421 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));

	422

	423 uint8x16_t pixels;

	424 int16x8_t p01_16, p23_16;

	425 int32x4_t p0, p1, p2, p3;

	426

	427

	428 #define ITERATION(src, accum) \

	429 pixels = vld1q_u8(src); \

	430 p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \

	431 p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \

	432 p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \

	433 p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \

	434 p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \

	435 p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \

	436 accum += p0; \

	437 accum += p1; \

	438 accum += p2; \

	439 accum += p3

	440

	441 ITERATION(srcData[0] + start, accum0);

	442 ITERATION(srcData[1] + start, accum1);

	443 ITERATION(srcData[2] + start, accum2);

	444 ITERATION(srcData[3] + start, accum3);

	445

	446 start += 16;

	447 filterValues += 4;

	448 }

	449

	450 int r = filterLength & 3;

	451 if (r) {

	452 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

	453 coeffs = vld1_s16(filterValues);

	454 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));

	455 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));

	456 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));

	457 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));

	458 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));

	459

	460 uint8x16_t pixels;

	461 int16x8_t p01_16, p23_16;

	462 int32x4_t p0, p1, p2, p3;

	463

	464 ITERATION(srcData[0] + start, accum0);

	465 ITERATION(srcData[1] + start, accum1);

	466 ITERATION(srcData[2] + start, accum2);

	467 ITERATION(srcData[3] + start, accum3);

	468 }

	469

	470 int16x4_t accum16;

	471 uint8x8_t res0, res1, res2, res3;

	472

	473 #define PACK_RESULT(accum, res) \

	474 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \

	475 accum16 = vqmovn_s32(accum); \

	476 res = vqmovun_s16(vcombine_s16(accum16, accum16));

	477

	478 PACK_RESULT(accum0, res0);

	479 PACK_RESULT(accum1, res1);

	480 PACK_RESULT(accum2, res2);

	481 PACK_RESULT(accum3, res3);

	482

	483 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);

	484 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);

	485 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);

	486 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);

	487 outRow[0] += 4;

	488 outRow[1] += 4;

	489 outRow[2] += 4;

	490 outRow[3] += 4;

	491 }

	492 }

	493

	494 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {

	495 // Padding \|paddingCount\| of more dummy coefficients after the coefficients

	496 // of last filter to prevent SIMD instructions which load 8 or 16 bytes

	497 // together to access invalid memory areas. We are not trying to align the

	498 // coefficients right now due to the opaqueness of <vector> implementation.

	499 // This has to be done after all \|AddFilter\| calls.

	500 for (int i = 0; i < 8; ++i) {

	501 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix ed>(0));

	502 }

	503 }

	504

	505 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {

	506 procs->fExtraHorizontalReads = 3;

	507 procs->fConvolveVertically = &convolveVertically_neon;

	508 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;

	509 procs->fConvolveHorizontally = &convolveHorizontally_neon;

	510 procs->fApplySIMDPadding = &applySIMDPadding_neon;

	511 }

	512

OLD	NEW

« no previous file with comments | « no previous file | src/opts/SkBitmapProcState_opts_arm.cpp » ('j') | no next file with comments »