Chromium Code Reviews| Index: src/opts/SkBitmapFilter_opts_SSE2.cpp |
| diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp |
| index ecaad23d76f1069eee17329424fd0446b0c352b0..61c26ff9a68e3d974d8db33da3ce8e1fbbdd770a 100644 |
| --- a/src/opts/SkBitmapFilter_opts_SSE2.cpp |
| +++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp |
| @@ -50,13 +50,6 @@ void convolveHorizontally_SSE2(const unsigned char* src_data, |
| int filter_offset, filter_length; |
| __m128i zero = _mm_setzero_si128(); |
| - __m128i mask[4]; |
| - // |mask| will be used to decimate all extra filter coefficients that are |
| - // loaded by SIMD when |filter_length| is not divisible by 4. |
| - // mask[0] is not used in following algorithm. |
| - mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| - mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| - mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
| // Output one pixel each iteration, calculating all channels (RGBA) together. |
| for (int out_x = 0; out_x < num_values; out_x++) { |
| @@ -120,38 +113,23 @@ void convolveHorizontally_SSE2(const unsigned char* src_data, |
| filter_values += 4; |
| } |
| - // When |filter_length| is not divisible by 4, we need to decimate some of |
| - // the filter coefficient that was loaded incorrectly to zero; Other than |
| - that the algorithm is same with above, except that the 4th pixel will be |
| - // always absent. |
| - int r = filter_length&3; |
| + // When |filter_length| is not divisible by 4, we accumulate the last 1 - 3 |
| + // coefficients one at a time. |
| + int r = filter_length & 3; |
| if (r) { |
| - // Note: filter_values must be padded to align_up(filter_offset, 8). |
| - __m128i coeff, coeff16; |
| - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); |
| - // Mask out extra filter taps. |
| - coeff = _mm_and_si128(coeff, mask[r]); |
| - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| - |
| - // Note: line buffer must be padded to align_up(filter_offset, 16). |
| - // We resolve this by use C-version for the last horizontal line. |
| - __m128i src8 = _mm_loadu_si128(row_to_filter); |
| - __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| - accum = _mm_add_epi32(accum, t); |
| - t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| - accum = _mm_add_epi32(accum, t); |
| - |
| - src16 = _mm_unpackhi_epi8(src8, zero); |
| - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| - mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| - mul_lo = _mm_mullo_epi16(src16, coeff16); |
| - t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| - accum = _mm_add_epi32(accum, t); |
| +#define ACCUM_REMAINDER(src, accum) { \ |
|
mtklein_C
2016/11/09 09:25:04
Can we make these ACCUM_REMAINDERs lambdas or stat
xiangze.zhang
2016/11/10 03:16:56
Done.
|
| + int remainder[4] = {0}; \ |
| + const unsigned char* pixels_left = src + (filter_offset + filter_length - r) * 4; \ |
| + for (int i = 0; i < r; i++) { \ |
| + SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; \ |
| + remainder[0] += coeff * pixels_left[i * 4 + 0]; \ |
| + remainder[1] += coeff * pixels_left[i * 4 + 1]; \ |
| + remainder[2] += coeff * pixels_left[i * 4 + 2]; \ |
| + remainder[3] += coeff * pixels_left[i * 4 + 3]; \ |
| + } \ |
| + __m128i t = _mm_set_epi32(remainder[3], remainder[2], remainder[1], remainder[0]); \ |
|
mtklein_C
2016/11/09 09:25:04
This is not a big deal, but we have been trending
xiangze.zhang
2016/11/10 03:16:56
Done.
|
| + accum = _mm_add_epi32(accum, t); } |
|
mtklein_C
2016/11/09 09:25:04
If we do need to use macros, it's probably best to
|
| + ACCUM_REMAINDER(src_data, accum); |
| } |
| // Shift right for fixed point implementation. |
| @@ -182,13 +160,6 @@ void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
| int filter_offset, filter_length; |
| __m128i zero = _mm_setzero_si128(); |
| - __m128i mask[4]; |
| - // |mask| will be used to decimate all extra filter coefficients that are |
| - // loaded by SIMD when |filter_length| is not divisible by 4. |
| - // mask[0] is not used in following algorithm. |
| - mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| - mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| - mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
| // Output one pixel each iteration, calculating all channels (RGBA) together. |
| for (int out_x = 0; out_x < num_values; out_x++) { |
| @@ -245,24 +216,10 @@ void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
| int r = filter_length & 3; |
| if (r) { |
| - // Note: filter_values must be padded to align_up(filter_offset, 8); |
| - __m128i coeff; |
| - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); |
| - // Mask out extra filter taps. |
| - coeff = _mm_and_si128(coeff, mask[r]); |
| - |
| - __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| - /* c1 c1 c1 c1 c0 c0 c0 c0 */ |
| - coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| - __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| - coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| - |
| - __m128i src8, src16, mul_hi, mul_lo, t; |
| - |
| - ITERATION(src_data[0] + start, accum0); |
| - ITERATION(src_data[1] + start, accum1); |
| - ITERATION(src_data[2] + start, accum2); |
| - ITERATION(src_data[3] + start, accum3); |
| + ACCUM_REMAINDER(src_data[0], accum0); |
| + ACCUM_REMAINDER(src_data[1], accum1); |
| + ACCUM_REMAINDER(src_data[2], accum2); |
| + ACCUM_REMAINDER(src_data[3], accum3); |
| } |
| accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| @@ -487,14 +444,3 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt |
| out_row); |
| } |
| } |
| - |
| -void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
| - // Padding |paddingCount| of more dummy coefficients after the coefficients |
| - // of last filter to prevent SIMD instructions which load 8 or 16 bytes |
| - // together to access invalid memory areas. We are not trying to align the |
| - // coefficients right now due to the opaqueness of <vector> implementation. |
| - // This has to be done after all |AddFilter| calls. |
| - for (int i = 0; i < 8; ++i) { |
| - filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); |
| - } |
| -} |