src/opts/SkBitmapProcState_arm_neon.cpp - Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes

Side by Side Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)

Patch Set: Change macros to functions Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 Google Inc.	2 * Copyright 2012 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBitmapProcState.h"	8 #include "SkBitmapProcState.h"

9 #include "SkBitmapProcState_filter.h"	9 #include "SkBitmapProcState_filter.h"

10 #include "SkColorPriv.h"	10 #include "SkColorPriv.h"

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
76 SG8_alpha_D32_filter_DXDY_neon,	76 SG8_alpha_D32_filter_DXDY_neon,

77 SG8_alpha_D32_filter_DX_neon,	77 SG8_alpha_D32_filter_DX_neon,

78 SG8_alpha_D32_filter_DX_neon,	78 SG8_alpha_D32_filter_DX_neon,

79 };	79 };

80	80

81 ///////////////////////////////////////////////////////////////////////////////	81 ///////////////////////////////////////////////////////////////////////////////

82	82

83 #include <arm_neon.h>	83 #include <arm_neon.h>

84 #include "SkConvolver.h"	84 #include "SkConvolver.h"

85	85

	86 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,

	87 const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t& accum, int r) {

	88 int remainder[4] = {0};

	89 for (int i = 0; i < r; i++) {

	90 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];

	91 remainder[0] += coeff * pixels_left[i * 4 + 0];

	92 remainder[1] += coeff * pixels_left[i * 4 + 1];

	93 remainder[2] += coeff * pixels_left[i * 4 + 2];

	94 remainder[3] += coeff * pixels_left[i * 4 + 3];

	95 }

	96 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]};

	97 accum += t;

	98 }

	99

86 // Convolves horizontally along a single row. The row data is given in	100 // Convolves horizontally along a single row. The row data is given in

87 // \|srcData\| and continues for the numValues() of the filter.	101 // \|srcData\| and continues for the numValues() of the filter.

88 void convolveHorizontally_neon(const unsigned char* srcData,	102 void convolveHorizontally_neon(const unsigned char* srcData,

89 const SkConvolutionFilter1D& filter,	103 const SkConvolutionFilter1D& filter,

90 unsigned char* outRow,	104 unsigned char* outRow,

91 bool hasAlpha) {	105 bool hasAlpha) {

92 // Loop over each pixel on this row in the output image.	106 // Loop over each pixel on this row in the output image.

93 int numValues = filter.numValues();	107 int numValues = filter.numValues();

94 for (int outX = 0; outX < numValues; outX++) {	108 for (int outX = 0; outX < numValues; outX++) {

95 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);	109 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
133	147

134 accum += p0;	148 accum += p0;

135 accum += p1;	149 accum += p1;

136 accum += p2;	150 accum += p2;

137 accum += p3;	151 accum += p3;

138	152

139 // Advance the pointers	153 // Advance the pointers

140 rowToFilter += 16;	154 rowToFilter += 16;

141 filterValues += 4;	155 filterValues += 4;

142 }	156 }

	157

143 int r = filterLength & 3;	158 int r = filterLength & 3;

144 if (r) {	159 if (r) {

145 const uint16_t mask[4][4] = {	160 int remainder_offset = (filterOffset + filterLength - r) * 4;

146 {0, 0, 0, 0},	161 accum_remainder(srcData + remainder_offset, filterValues, accum, r);

147 {0xFFFF, 0, 0, 0},

148 {0xFFFF, 0xFFFF, 0, 0},

149 {0xFFFF, 0xFFFF, 0xFFFF, 0}

150 };

151 uint16x4_t coeffs;

152 int16x4_t coeff0, coeff1, coeff2;

153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));

154 coeffs &= vld1_u16(&mask[r][0]);

155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask0));

156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask1));

157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask2));

158

159 // Load pixels and calc

160 uint8x16_t pixels = vld1q_u8(rowToFilter);

161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));

162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));

163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);

164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);

165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);

166

167 accum += p0;

168 accum += p1;

169 accum += p2;

170 }	162 }

171	163

172 // Bring this value back in range. All of the filter scaling factors	164 // Bring this value back in range. All of the filter scaling factors

173 // are in fixed point with kShiftBits bits of fractional part.	165 // are in fixed point with kShiftBits bits of fractional part.

174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);	166 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

175	167

176 // Pack and store the new pixel.	168 // Pack and store the new pixel.

177 int16x4_t accum16 = vqmovn_s32(accum);	169 int16x4_t accum16 = vqmovn_s32(accum);

178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));	170 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));

179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);	171 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);

(...skipping 187 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
367 unsigned char* outRow[4],	359 unsigned char* outRow[4],

368 size_t outRowBytes) {	360 size_t outRowBytes) {

369	361

370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);	362 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);	363 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);

372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);	364 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);

373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);	365 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

374 int num_values = filter.numValues();	366 int num_values = filter.numValues();

375	367

376 int filterOffset, filterLength;	368 int filterOffset, filterLength;

377 // \|mask\| will be used to decimate all extra filter coefficients that are

378 // loaded by SIMD when \|filter_length\| is not divisible by 4.

379 // mask[0] is not used in following algorithm.

380 const uint16_t mask[4][4] = {

381 {0, 0, 0, 0},

382 {0xFFFF, 0, 0, 0},

383 {0xFFFF, 0xFFFF, 0, 0},

384 {0xFFFF, 0xFFFF, 0xFFFF, 0}

385 };

386	369

387 // Output one pixel each iteration, calculating all channels (RGBA) together .	370 // Output one pixel each iteration, calculating all channels (RGBA) together .

388 for (int outX = 0; outX < num_values; outX++) {	371 for (int outX = 0; outX < num_values; outX++) {

389	372

390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =	373 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =

391 filter.FilterForValue(outX, &filterOffset, &filterLength);	374 filter.FilterForValue(outX, &filterOffset, &filterLength);

392	375

393 // four pixels in a column per iteration.	376 // four pixels in a column per iteration.

394 int32x4_t accum0 = vdupq_n_s32(0);	377 int32x4_t accum0 = vdupq_n_s32(0);

395 int32x4_t accum1 = vdupq_n_s32(0);	378 int32x4_t accum1 = vdupq_n_s32(0);

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
430 ITERATION(srcData[1] + start, accum1);	413 ITERATION(srcData[1] + start, accum1);

431 ITERATION(srcData[2] + start, accum2);	414 ITERATION(srcData[2] + start, accum2);

432 ITERATION(srcData[3] + start, accum3);	415 ITERATION(srcData[3] + start, accum3);

433	416

434 start += 16;	417 start += 16;

435 filterValues += 4;	418 filterValues += 4;

436 }	419 }

437	420

438 int r = filterLength & 3;	421 int r = filterLength & 3;

439 if (r) {	422 if (r) {

440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;	423 int remainder_offset = (filterOffset + filterLength - r) * 4;

441 coeffs = vld1_s16(filterValues);	424 accum_remainder(srcData[0] + remainder_offset, filterValues, accum0, r);

442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));	425 accum_remainder(srcData[1] + remainder_offset, filterValues, accum1, r);

443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));	426 accum_remainder(srcData[2] + remainder_offset, filterValues, accum2, r);

444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));	427 accum_remainder(srcData[3] + remainder_offset, filterValues, accum3, r);

445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));

446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));

447

448 uint8x16_t pixels;

449 int16x8_t p01_16, p23_16;

450 int32x4_t p0, p1, p2, p3;

451

452 ITERATION(srcData[0] + start, accum0);

453 ITERATION(srcData[1] + start, accum1);

454 ITERATION(srcData[2] + start, accum2);

455 ITERATION(srcData[3] + start, accum3);

456 }	428 }

457	429

458 int16x4_t accum16;	430 int16x4_t accum16;

459 uint8x8_t res0, res1, res2, res3;	431 uint8x8_t res0, res1, res2, res3;

460	432

461 #define PACK_RESULT(accum, res) \	433 #define PACK_RESULT(accum, res) \

462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \	434 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \

463 accum16 = vqmovn_s32(accum); \	435 accum16 = vqmovn_s32(accum); \

464 res = vqmovun_s16(vcombine_s16(accum16, accum16));	436 res = vqmovun_s16(vcombine_s16(accum16, accum16));

465	437

466 PACK_RESULT(accum0, res0);	438 PACK_RESULT(accum0, res0);

467 PACK_RESULT(accum1, res1);	439 PACK_RESULT(accum1, res1);

468 PACK_RESULT(accum2, res2);	440 PACK_RESULT(accum2, res2);

469 PACK_RESULT(accum3, res3);	441 PACK_RESULT(accum3, res3);

470	442

471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);	443 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);

472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);	444 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);

473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);	445 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);

474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);	446 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);

475 outRow[0] += 4;	447 outRow[0] += 4;

476 outRow[1] += 4;	448 outRow[1] += 4;

477 outRow[2] += 4;	449 outRow[2] += 4;

478 outRow[3] += 4;	450 outRow[3] += 4;

479 }	451 }

480 }	452 }

481	453

482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {

483 // Padding \|paddingCount\| of more dummy coefficients after the coefficients

484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes

485 // together to access invalid memory areas. We are not trying to align the

486 // coefficients right now due to the opaqueness of <vector> implementation.

487 // This has to be done after all \|AddFilter\| calls.

488 for (int i = 0; i < 8; ++i) {

489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix ed>(0));

490 }

491 }

492

493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {	454 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {

494 procs->fExtraHorizontalReads = 3;

495 procs->fConvolveVertically = &convolveVertically_neon;	455 procs->fConvolveVertically = &convolveVertically_neon;

496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;	456 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;

497 procs->fConvolveHorizontally = &convolveHorizontally_neon;	457 procs->fConvolveHorizontally = &convolveHorizontally_neon;

498 procs->fApplySIMDPadding = &applySIMDPadding_neon;

499 }	458 }

OLD	NEW

« no previous file with comments | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »