src/opts/SkBitmapProcState_arm_neon.cpp - Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes

Unified Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)

Patch Set: improve neon performance Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/opts/SkBitmapProcState_arm_neon.cpp

diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp

index ce2656da65493194e4bc1785c4dc3439a93b3357..432db2166e566cbcb3473aa1c76cde6bef48af94 100644

--- a/src/opts/SkBitmapProcState_arm_neon.cpp

+++ b/src/opts/SkBitmapProcState_arm_neon.cpp

@@ -142,31 +142,19 @@ void convolveHorizontally_neon(const unsigned char* srcData,

}

int r = filterLength & 3;

if (r) {

- const uint16_t mask[4][4] = {

- {0, 0, 0, 0},

- {0xFFFF, 0, 0, 0},

- {0xFFFF, 0xFFFF, 0, 0},

- {0xFFFF, 0xFFFF, 0xFFFF, 0}

- };

- uint16x4_t coeffs;

- int16x4_t coeff0, coeff1, coeff2;

- coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));

- coeffs &= vld1_u16(&mask[r][0]);

- coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));

- coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));

- coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));

- // Load pixels and calc

- uint8x16_t pixels = vld1q_u8(rowToFilter);

- int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));

- int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

- int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);

- int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);

- int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);

- accum += p0;

- accum += p1;

- accum += p2;

+#define ACCUM_REMAINDER(src, accum) { \

+ int remainder[4] = {0}; \

+ const unsigned char* pixels_left = src + (filterOffset + filterLength - r) * 4;\

+ for (int i = 0; i < r; i++) { \

+ SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; \

+ remainder[0] += coeff * pixels_left[i * 4 + 0]; \

+ remainder[1] += coeff * pixels_left[i * 4 + 1]; \

+ remainder[2] += coeff * pixels_left[i * 4 + 2]; \

+ remainder[3] += coeff * pixels_left[i * 4 + 3]; \

+ } \

+ int32x4_t t{remainder[0], remainder[1], remainder[2], remainder[3]}; \

+ accum += t; }

+ ACCUM_REMAINDER(srcData, accum);

}

// Bring this value back in range. All of the filter scaling factors

@@ -374,15 +362,6 @@ void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],

int num_values = filter.numValues();

int filterOffset, filterLength;

- // |mask| will be used to decimate all extra filter coefficients that are

- // loaded by SIMD when |filter_length| is not divisible by 4.

- // mask[0] is not used in following algorithm.

- const uint16_t mask[4][4] = {

- {0, 0, 0, 0},

- {0xFFFF, 0, 0, 0},

- {0xFFFF, 0xFFFF, 0, 0},

- {0xFFFF, 0xFFFF, 0xFFFF, 0}

- };

// Output one pixel each iteration, calculating all channels (RGBA) together.

for (int outX = 0; outX < num_values; outX++) {

@@ -437,22 +416,10 @@ void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],

int r = filterLength & 3;

if (r) {

- int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

- coeffs = vld1_s16(filterValues);

- coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));

- coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));

- coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));

- coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));

- coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

- uint8x16_t pixels;

- int16x8_t p01_16, p23_16;

- int32x4_t p0, p1, p2, p3;

- ITERATION(srcData[0] + start, accum0);

- ITERATION(srcData[1] + start, accum1);

- ITERATION(srcData[2] + start, accum2);

- ITERATION(srcData[3] + start, accum3);

+ ACCUM_REMAINDER(srcData[0], accum0);

+ ACCUM_REMAINDER(srcData[1], accum1);

+ ACCUM_REMAINDER(srcData[2], accum2);

+ ACCUM_REMAINDER(srcData[3], accum3);

}

int16x4_t accum16;

@@ -479,21 +446,8 @@ void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],

}

-void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {

- // Padding |paddingCount| of more dummy coefficients after the coefficients

- // of last filter to prevent SIMD instructions which load 8 or 16 bytes

- // together to access invalid memory areas. We are not trying to align the

- // coefficients right now due to the opaqueness of <vector> implementation.

- // This has to be done after all |AddFilter| calls.

- for (int i = 0; i < 8; ++i) {

- filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));

- }

void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {

- procs->fExtraHorizontalReads = 3;

procs->fConvolveVertically = &convolveVertically_neon;

procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;

procs->fConvolveHorizontally = &convolveHorizontally_neon;

- procs->fApplySIMDPadding = &applySIMDPadding_neon;

}

« src/opts/SkBitmapFilter_opts_SSE2.cpp ('K') | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »