Index: src/opts/SkBitmapProcState_arm_neon.cpp
diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp
index ce2656da65493194e4bc1785c4dc3439a93b3357..4193e6af071d10af8fb63be62d804a5722b336b9 100644
--- a/src/opts/SkBitmapProcState_arm_neon.cpp
+++ b/src/opts/SkBitmapProcState_arm_neon.cpp
@@ -83,6 +83,20 @@ const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
 #include <arm_neon.h>
 #include "SkConvolver.h"
 
+static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,
+        const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t& accum, int r) {
+    int remainder[4] = {0};
+    for (int i = 0; i < r; i++) {
+        SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];
+        remainder[0] += coeff * pixels_left[i * 4 + 0];
+        remainder[1] += coeff * pixels_left[i * 4 + 1];
+        remainder[2] += coeff * pixels_left[i * 4 + 2];
+        remainder[3] += coeff * pixels_left[i * 4 + 3];
+    }
+    int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]};
+    accum += t;
+}
+
 // Convolves horizontally along a single row. The row data is given in
 // |srcData| and continues for the numValues() of the filter.
 void convolveHorizontally_neon(const unsigned char* srcData,
@@ -140,33 +154,11 @@ void convolveHorizontally_neon(const unsigned char* srcData,
             rowToFilter += 16;
             filterValues += 4;
         }
+
         int r = filterLength & 3;
         if (r) {
-            const uint16_t mask[4][4] = {
-                {0, 0, 0, 0},
-                {0xFFFF, 0, 0, 0},
-                {0xFFFF, 0xFFFF, 0, 0},
-                {0xFFFF, 0xFFFF, 0xFFFF, 0}
-            };
-            uint16x4_t coeffs;
-            int16x4_t coeff0, coeff1, coeff2;
-            coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
-            coeffs &= vld1_u16(&mask[r][0]);
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));
-
-            // Load pixels and calc
-            uint8x16_t pixels = vld1q_u8(rowToFilter);
-            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
-            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
-            int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
-            int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
-            int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
-
-            accum += p0;
-            accum += p1;
-            accum += p2;
+            int remainder_offset = (filterOffset + filterLength - r) * 4;
+            accum_remainder(srcData + remainder_offset, filterValues, accum, r);
         }
 
         // Bring this value back in range. All of the filter scaling factors
@@ -374,15 +366,6 @@ void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
     int num_values = filter.numValues();
 
     int filterOffset, filterLength;
-    // |mask| will be used to decimate all extra filter coefficients that are
-    // loaded by SIMD when |filter_length| is not divisible by 4.
-    // mask[0] is not used in following algorithm.
-    const uint16_t mask[4][4] = {
-        {0, 0, 0, 0},
-        {0xFFFF, 0, 0, 0},
-        {0xFFFF, 0xFFFF, 0, 0},
-        {0xFFFF, 0xFFFF, 0xFFFF, 0}
-    };
 
     // Output one pixel each iteration, calculating all channels (RGBA) together.
     for (int outX = 0; outX < num_values; outX++) {
@@ -437,22 +420,11 @@ void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
 
         int r = filterLength & 3;
         if (r) {
-            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
-            coeffs = vld1_s16(filterValues);
-            coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
-            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
-
-            uint8x16_t pixels;
-            int16x8_t p01_16, p23_16;
-            int32x4_t p0, p1, p2, p3;
-
-            ITERATION(srcData[0] + start, accum0);
-            ITERATION(srcData[1] + start, accum1);
-            ITERATION(srcData[2] + start, accum2);
-            ITERATION(srcData[3] + start, accum3);
+            int remainder_offset = (filterOffset + filterLength - r) * 4;
+            accum_remainder(srcData[0] + remainder_offset, filterValues, accum0, r);
+            accum_remainder(srcData[1] + remainder_offset, filterValues, accum1, r);
+            accum_remainder(srcData[2] + remainder_offset, filterValues, accum2, r);
+            accum_remainder(srcData[3] + remainder_offset, filterValues, accum3, r);
         }
 
         int16x4_t accum16;
@@ -479,21 +451,8 @@ void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
     }
 }
 
-void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
-    // Padding |paddingCount| of more dummy coefficients after the coefficients
-    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
-    // together to access invalid memory areas. We are not trying to align the
-    // coefficients right now due to the opaqueness of <vector> implementation.
-    // This has to be done after all |AddFilter| calls.
-    for (int i = 0; i < 8; ++i) {
-        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
-    }
-}
-
 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
-    procs->fExtraHorizontalReads = 3;
     procs->fConvolveVertically = &convolveVertically_neon;
     procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
     procs->fConvolveHorizontally = &convolveHorizontally_neon;
-    procs->fApplySIMDPadding = &applySIMDPadding_neon;
 }
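
Note on the change: the deleted code handled the last one to three filter taps by loading a full 8- or 16-byte SIMD vector and masking off the lanes past the real coefficients, which is why the filter had to be padded with dummy values (applySIMDPadding_neon, fExtraHorizontalReads = 3). The new accum_remainder() computes remainder_offset = (filterOffset + filterLength - r) * 4, the byte offset of the first of the last r taps, and reads exactly r * 4 bytes from there, so nothing ever loads past the end of the coefficient or pixel buffers and the padding contract can be dropped.

The standalone sketch below, which is not part of the patch, shows the same main-loop-plus-scalar-tail pattern in portable C++. The Fixed alias, kShiftBits, and the function names are illustrative assumptions; the real definitions live in SkConvolver.h.

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for SkConvolutionFilter1D::ConvolutionFixed and its fractional
    // precision -- assumed values for illustration only.
    using Fixed = int16_t;
    constexpr int kShiftBits = 14;

    // Scalar accumulation of the final |r| taps (r = 0..3) of one RGBA pixel,
    // mirroring what accum_remainder() does in the patch.
    static void accumRemainder(const unsigned char* px, const Fixed* coeffs,
                               int32_t accum[4], int r) {
        for (int i = 0; i < r; i++) {
            for (int c = 0; c < 4; c++) {
                accum[c] += coeffs[i] * px[i * 4 + c];
            }
        }
    }

    // Convolve one output pixel: a four-taps-at-a-time main loop (the part the
    // NEON kernels vectorize) followed by the scalar tail, so no load reaches
    // past the end of |pixels| or |coeffs|.
    static void convolvePixel(const unsigned char* pixels, const Fixed* coeffs,
                              int length, unsigned char out[4]) {
        int32_t accum[4] = {0, 0, 0, 0};
        int i = 0;
        for (; i + 4 <= length; i += 4) {
            for (int t = 0; t < 4; t++) {
                for (int c = 0; c < 4; c++) {
                    accum[c] += coeffs[i + t] * pixels[(i + t) * 4 + c];
                }
            }
        }
        accumRemainder(pixels + i * 4, coeffs + i, accum, length - i);
        for (int c = 0; c < 4; c++) {
            int32_t v = accum[c] >> kShiftBits;
            out[c] = static_cast<unsigned char>(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }

    int main() {
        // A five-tap box filter over five RGBA pixels: exercises the main loop
        // once (four taps) and the scalar tail (one tap).
        const Fixed kFifth = static_cast<Fixed>((1 << kShiftBits) / 5);
        const Fixed coeffs[5] = {kFifth, kFifth, kFifth, kFifth, kFifth};
        unsigned char pixels[5 * 4];
        for (int i = 0; i < 5 * 4; i++) {
            pixels[i] = static_cast<unsigned char>(100 + i);
        }
        unsigned char out[4];
        convolvePixel(pixels, coeffs, 5, out);
        printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
        return 0;
    }

The trade is at most three scalar multiply-adds per channel per output pixel in exchange for deleting the mask tables and the caller-side padding requirement, which every user of the filter previously had to remember to apply.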
|