src/opts/SkBitmapProcState_arm_neon.cpp - Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes

Side by Side Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)

Patch Set: improve neon performance Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 Google Inc.	2 * Copyright 2012 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBitmapProcState.h"	8 #include "SkBitmapProcState.h"

9 #include "SkBitmapProcState_filter.h"	9 #include "SkBitmapProcState_filter.h"

10 #include "SkColorPriv.h"	10 #include "SkColorPriv.h"

(...skipping 124 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
135 accum += p1;	135 accum += p1;

136 accum += p2;	136 accum += p2;

137 accum += p3;	137 accum += p3;

138	138

139 // Advance the pointers	139 // Advance the pointers

140 rowToFilter += 16;	140 rowToFilter += 16;

141 filterValues += 4;	141 filterValues += 4;

142 }	142 }

143 int r = filterLength & 3;	143 int r = filterLength & 3;

144 if (r) {	144 if (r) {

145 const uint16_t mask[4][4] = {	145 #define ACCUM_REMAINDER(src, accum) { \

146 {0, 0, 0, 0},	146 int remainder[4] = {0}; \

147 {0xFFFF, 0, 0, 0},	147 const unsigned char* pixels_left = src + (filterOffset + filterLengt h - r) * 4;\

148 {0xFFFF, 0xFFFF, 0, 0},	148 for (int i = 0; i < r; i++) { \

149 {0xFFFF, 0xFFFF, 0xFFFF, 0}	149 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; \

150 };	150 remainder[0] += coeff * pixels_left[i * 4 + 0]; \

151 uint16x4_t coeffs;	151 remainder[1] += coeff * pixels_left[i * 4 + 1]; \

152 int16x4_t coeff0, coeff1, coeff2;	152 remainder[2] += coeff * pixels_left[i * 4 + 2]; \

153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));	153 remainder[3] += coeff * pixels_left[i * 4 + 3]; \

154 coeffs &= vld1_u16(&mask[r][0]);	154 } \

155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask0));	155 int32x4_t t{remainder[0], remainder[1], remainder[2], remainder[3]}; \

156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask1));	156 accum += t; }

157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask2));	157 ACCUM_REMAINDER(srcData, accum);

158

159 // Load pixels and calc

160 uint8x16_t pixels = vld1q_u8(rowToFilter);

161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));

162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));

163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);

164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);

165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);

166

167 accum += p0;

168 accum += p1;

169 accum += p2;

170 }	158 }

171	159

172 // Bring this value back in range. All of the filter scaling factors	160 // Bring this value back in range. All of the filter scaling factors

173 // are in fixed point with kShiftBits bits of fractional part.	161 // are in fixed point with kShiftBits bits of fractional part.

174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);	162 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

175	163

176 // Pack and store the new pixel.	164 // Pack and store the new pixel.

177 int16x4_t accum16 = vqmovn_s32(accum);	165 int16x4_t accum16 = vqmovn_s32(accum);

178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));	166 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));

179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);	167 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);

(...skipping 187 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
367 unsigned char* outRow[4],	355 unsigned char* outRow[4],

368 size_t outRowBytes) {	356 size_t outRowBytes) {

369	357

370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);	358 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);	359 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);

372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);	360 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);

373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);	361 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

374 int num_values = filter.numValues();	362 int num_values = filter.numValues();

375	363

376 int filterOffset, filterLength;	364 int filterOffset, filterLength;

377 // \|mask\| will be used to decimate all extra filter coefficients that are

378 // loaded by SIMD when \|filter_length\| is not divisible by 4.

379 // mask[0] is not used in following algorithm.

380 const uint16_t mask[4][4] = {

381 {0, 0, 0, 0},

382 {0xFFFF, 0, 0, 0},

383 {0xFFFF, 0xFFFF, 0, 0},

384 {0xFFFF, 0xFFFF, 0xFFFF, 0}

385 };

386	365

387 // Output one pixel each iteration, calculating all channels (RGBA) together .	366 // Output one pixel each iteration, calculating all channels (RGBA) together .

388 for (int outX = 0; outX < num_values; outX++) {	367 for (int outX = 0; outX < num_values; outX++) {

389	368

390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =	369 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =

391 filter.FilterForValue(outX, &filterOffset, &filterLength);	370 filter.FilterForValue(outX, &filterOffset, &filterLength);

392	371

393 // four pixels in a column per iteration.	372 // four pixels in a column per iteration.

394 int32x4_t accum0 = vdupq_n_s32(0);	373 int32x4_t accum0 = vdupq_n_s32(0);

395 int32x4_t accum1 = vdupq_n_s32(0);	374 int32x4_t accum1 = vdupq_n_s32(0);

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
430 ITERATION(srcData[1] + start, accum1);	409 ITERATION(srcData[1] + start, accum1);

431 ITERATION(srcData[2] + start, accum2);	410 ITERATION(srcData[2] + start, accum2);

432 ITERATION(srcData[3] + start, accum3);	411 ITERATION(srcData[3] + start, accum3);

433	412

434 start += 16;	413 start += 16;

435 filterValues += 4;	414 filterValues += 4;

436 }	415 }

437	416

438 int r = filterLength & 3;	417 int r = filterLength & 3;

439 if (r) {	418 if (r) {

440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;	419 ACCUM_REMAINDER(srcData[0], accum0);

441 coeffs = vld1_s16(filterValues);	420 ACCUM_REMAINDER(srcData[1], accum1);

442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));	421 ACCUM_REMAINDER(srcData[2], accum2);

443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));	422 ACCUM_REMAINDER(srcData[3], accum3);

444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));

445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));

446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));

447

448 uint8x16_t pixels;

449 int16x8_t p01_16, p23_16;

450 int32x4_t p0, p1, p2, p3;

451

452 ITERATION(srcData[0] + start, accum0);

453 ITERATION(srcData[1] + start, accum1);

454 ITERATION(srcData[2] + start, accum2);

455 ITERATION(srcData[3] + start, accum3);

456 }	423 }

457	424

458 int16x4_t accum16;	425 int16x4_t accum16;

459 uint8x8_t res0, res1, res2, res3;	426 uint8x8_t res0, res1, res2, res3;

460	427

461 #define PACK_RESULT(accum, res) \	428 #define PACK_RESULT(accum, res) \

462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \	429 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \

463 accum16 = vqmovn_s32(accum); \	430 accum16 = vqmovn_s32(accum); \

464 res = vqmovun_s16(vcombine_s16(accum16, accum16));	431 res = vqmovun_s16(vcombine_s16(accum16, accum16));

465	432

466 PACK_RESULT(accum0, res0);	433 PACK_RESULT(accum0, res0);

467 PACK_RESULT(accum1, res1);	434 PACK_RESULT(accum1, res1);

468 PACK_RESULT(accum2, res2);	435 PACK_RESULT(accum2, res2);

469 PACK_RESULT(accum3, res3);	436 PACK_RESULT(accum3, res3);

470	437

471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);	438 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);

472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);	439 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);

473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);	440 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);

474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);	441 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);

475 outRow[0] += 4;	442 outRow[0] += 4;

476 outRow[1] += 4;	443 outRow[1] += 4;

477 outRow[2] += 4;	444 outRow[2] += 4;

478 outRow[3] += 4;	445 outRow[3] += 4;

479 }	446 }

480 }	447 }

481	448

482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {

483 // Padding \|paddingCount\| of more dummy coefficients after the coefficients

484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes

485 // together to access invalid memory areas. We are not trying to align the

486 // coefficients right now due to the opaqueness of <vector> implementation.

487 // This has to be done after all \|AddFilter\| calls.

488 for (int i = 0; i < 8; ++i) {

489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix ed>(0));

490 }

491 }

492

493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {	449 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {

494 procs->fExtraHorizontalReads = 3;

495 procs->fConvolveVertically = &convolveVertically_neon;	450 procs->fConvolveVertically = &convolveVertically_neon;

496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;	451 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;

497 procs->fConvolveHorizontally = &convolveHorizontally_neon;	452 procs->fConvolveHorizontally = &convolveHorizontally_neon;

498 procs->fApplySIMDPadding = &applySIMDPadding_neon;

499 }	453 }

OLD	NEW

« src/opts/SkBitmapFilter_opts_SSE2.cpp ('K') | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »