| Index: src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| index ecaad23d76f1069eee17329424fd0446b0c352b0..324ac1a5c1e8a9791d90d8ea0d22a997f5fc3d4c 100644
|
| --- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| +++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| @@ -40,6 +40,20 @@ static inline void print128f(__m128 value) {
|
| }
|
| #endif
|
|
|
| +static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,  // Scalar tail: adds coeff-weighted bytes of r 4-byte pixels into accum.
|
| + const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& accum, int r) {  // filter_values[i] weights pixel i; accum is updated in place.
|
| + int remainder[4] = {0};  // One zero-initialised running sum per byte position within a 4-byte pixel.
|
| + for (int i = 0; i < r; i++) {  // Visible callers pass r = filter_length & 3 and only call when it is non-zero.
|
| + SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];  // Same coefficient is applied to all four bytes of pixel i.
|
| + remainder[0] += coeff * pixels_left[i * 4 + 0];
|
| + remainder[1] += coeff * pixels_left[i * 4 + 1];
|
| + remainder[2] += coeff * pixels_left[i * 4 + 2];
|
| + remainder[3] += coeff * pixels_left[i * 4 + 3];
|
| + }
|
| + __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);  // setr: remainder[0] goes to the lowest 32-bit lane.
|
| + accum = _mm_add_epi32(accum, t);  // Lane-wise 32-bit add into the caller's accumulator.
|
| +}
|
| +
|
| // Convolves horizontally along a single row. The row data is given in
|
| // |src_data| and continues for the num_values() of the filter.
|
| void convolveHorizontally_SSE2(const unsigned char* src_data,
|
| @@ -50,13 +64,6 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
|
|
|
| int filter_offset, filter_length;
|
| __m128i zero = _mm_setzero_si128();
|
| - __m128i mask[4];
|
| - // |mask| will be used to decimate all extra filter coefficients that are
|
| - // loaded by SIMD when |filter_length| is not divisible by 4.
|
| - // mask[0] is not used in following algorithm.
|
| - mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
|
| - mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
|
| - mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
|
|
|
| // Output one pixel each iteration, calculating all channels (RGBA) together.
|
| for (int out_x = 0; out_x < num_values; out_x++) {
|
| @@ -120,38 +127,12 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
|
| filter_values += 4;
|
| }
|
|
|
| - // When |filter_length| is not divisible by 4, we need to decimate some of
|
| - // the filter coefficient that was loaded incorrectly to zero; Other than
|
| - // that the algorithm is same with above, exceot that the 4th pixel will be
|
| - // always absent.
|
| - int r = filter_length&3;
|
| + // When |filter_length| is not divisible by 4, we accumulate the last 1 - 3
|
| + // coefficients one at a time.
|
| + int r = filter_length & 3;
|
| if (r) {
|
| - // Note: filter_values must be padded to align_up(filter_offset, 8).
|
| - __m128i coeff, coeff16;
|
| - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
|
| - // Mask out extra filter taps.
|
| - coeff = _mm_and_si128(coeff, mask[r]);
|
| - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
|
| - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
|
| -
|
| - // Note: line buffer must be padded to align_up(filter_offset, 16).
|
| - // We resolve this by use C-version for the last horizontal line.
|
| - __m128i src8 = _mm_loadu_si128(row_to_filter);
|
| - __m128i src16 = _mm_unpacklo_epi8(src8, zero);
|
| - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
|
| - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
|
| - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
|
| - accum = _mm_add_epi32(accum, t);
|
| - t = _mm_unpackhi_epi16(mul_lo, mul_hi);
|
| - accum = _mm_add_epi32(accum, t);
|
| -
|
| - src16 = _mm_unpackhi_epi8(src8, zero);
|
| - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
|
| - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
|
| - mul_hi = _mm_mulhi_epi16(src16, coeff16);
|
| - mul_lo = _mm_mullo_epi16(src16, coeff16);
|
| - t = _mm_unpacklo_epi16(mul_lo, mul_hi);
|
| - accum = _mm_add_epi32(accum, t);
|
| + int remainder_offset = (filter_offset + filter_length - r) * 4;
|
| + accum_remainder(src_data + remainder_offset, filter_values, accum, r);
|
| }
|
|
|
| // Shift right for fixed point implementation.
|
| @@ -182,13 +163,6 @@ void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
|
|
|
| int filter_offset, filter_length;
|
| __m128i zero = _mm_setzero_si128();
|
| - __m128i mask[4];
|
| - // |mask| will be used to decimate all extra filter coefficients that are
|
| - // loaded by SIMD when |filter_length| is not divisible by 4.
|
| - // mask[0] is not used in following algorithm.
|
| - mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
|
| - mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
|
| - mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
|
|
|
| // Output one pixel each iteration, calculating all channels (RGBA) together.
|
| for (int out_x = 0; out_x < num_values; out_x++) {
|
| @@ -245,24 +219,11 @@ void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
|
|
|
| int r = filter_length & 3;
|
| if (r) {
|
| - // Note: filter_values must be padded to align_up(filter_offset, 8);
|
| - __m128i coeff;
|
| - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
|
| - // Mask out extra filter taps.
|
| - coeff = _mm_and_si128(coeff, mask[r]);
|
| -
|
| - __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
|
| - /* c1 c1 c1 c1 c0 c0 c0 c0 */
|
| - coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
|
| - __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
|
| - coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
|
| -
|
| - __m128i src8, src16, mul_hi, mul_lo, t;
|
| -
|
| - ITERATION(src_data[0] + start, accum0);
|
| - ITERATION(src_data[1] + start, accum1);
|
| - ITERATION(src_data[2] + start, accum2);
|
| - ITERATION(src_data[3] + start, accum3);
|
| + int remainder_offset = (filter_offset + filter_length - r) * 4;
|
| + accum_remainder(src_data[0] + remainder_offset, filter_values, accum0, r);
|
| + accum_remainder(src_data[1] + remainder_offset, filter_values, accum1, r);
|
| + accum_remainder(src_data[2] + remainder_offset, filter_values, accum2, r);
|
| + accum_remainder(src_data[3] + remainder_offset, filter_values, accum3, r);
|
| }
|
|
|
| accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
|
| @@ -487,14 +448,3 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
|
| out_row);
|
| }
|
| }
|
| -
|
| -void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
|
| - // Padding |paddingCount| of more dummy coefficients after the coefficients
|
| - // of last filter to prevent SIMD instructions which load 8 or 16 bytes
|
| - // together to access invalid memory areas. We are not trying to align the
|
| - // coefficients right now due to the opaqueness of <vector> implementation.
|
| - // This has to be done after all |AddFilter| calls.
|
| - for (int i = 0; i < 8; ++i) {
|
| - filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
|
| - }
|
| -}
|
|
|