Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(995)

Unified Diff: src/opts/SkBitmapFilter_opts_SSE2.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)
Patch Set: Change macros to functions Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkBitmapProcState_arm_neon.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/opts/SkBitmapFilter_opts_SSE2.cpp
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
index ecaad23d76f1069eee17329424fd0446b0c352b0..324ac1a5c1e8a9791d90d8ea0d22a997f5fc3d4c 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -40,6 +40,20 @@ static inline void print128f(__m128 value) {
}
#endif
+static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,
+ const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& accum, int r) {
+ int remainder[4] = {0};
+ for (int i = 0; i < r; i++) {
+ SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];
+ remainder[0] += coeff * pixels_left[i * 4 + 0];
+ remainder[1] += coeff * pixels_left[i * 4 + 1];
+ remainder[2] += coeff * pixels_left[i * 4 + 2];
+ remainder[3] += coeff * pixels_left[i * 4 + 3];
+ }
+ __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
+ accum = _mm_add_epi32(accum, t);
+}
+
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
void convolveHorizontally_SSE2(const unsigned char* src_data,
@@ -50,13 +64,6 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
int filter_offset, filter_length;
__m128i zero = _mm_setzero_si128();
- __m128i mask[4];
- // |mask| will be used to decimate all extra filter coefficients that are
- // loaded by SIMD when |filter_length| is not divisible by 4.
- // mask[0] is not used in following algorithm.
- mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
- mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
- mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// Output one pixel each iteration, calculating all channels (RGBA) together.
for (int out_x = 0; out_x < num_values; out_x++) {
@@ -120,38 +127,12 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
filter_values += 4;
}
- // When |filter_length| is not divisible by 4, we need to decimate some of
- // the filter coefficient that was loaded incorrectly to zero; Other than
- // that the algorithm is same with above, exceot that the 4th pixel will be
- // always absent.
- int r = filter_length&3;
+ // When |filter_length| is not divisible by 4, we accumulate the last 1 - 3
+ // coefficients one at a time.
+ int r = filter_length & 3;
if (r) {
- // Note: filter_values must be padded to align_up(filter_offset, 8).
- __m128i coeff, coeff16;
- coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
- // Mask out extra filter taps.
- coeff = _mm_and_si128(coeff, mask[r]);
- coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
- coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-
- // Note: line buffer must be padded to align_up(filter_offset, 16).
- // We resolve this by use C-version for the last horizontal line.
- __m128i src8 = _mm_loadu_si128(row_to_filter);
- __m128i src16 = _mm_unpacklo_epi8(src8, zero);
- __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
- __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
- __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
-
- src16 = _mm_unpackhi_epi8(src8, zero);
- coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
- coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
- mul_hi = _mm_mulhi_epi16(src16, coeff16);
- mul_lo = _mm_mullo_epi16(src16, coeff16);
- t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
+ int remainder_offset = (filter_offset + filter_length - r) * 4;
+ accum_remainder(src_data + remainder_offset, filter_values, accum, r);
}
// Shift right for fixed point implementation.
@@ -182,13 +163,6 @@ void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
int filter_offset, filter_length;
__m128i zero = _mm_setzero_si128();
- __m128i mask[4];
- // |mask| will be used to decimate all extra filter coefficients that are
- // loaded by SIMD when |filter_length| is not divisible by 4.
- // mask[0] is not used in following algorithm.
- mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
- mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
- mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// Output one pixel each iteration, calculating all channels (RGBA) together.
for (int out_x = 0; out_x < num_values; out_x++) {
@@ -245,24 +219,11 @@ void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
int r = filter_length & 3;
if (r) {
- // Note: filter_values must be padded to align_up(filter_offset, 8);
- __m128i coeff;
- coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
- // Mask out extra filter taps.
- coeff = _mm_and_si128(coeff, mask[r]);
-
- __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
- /* c1 c1 c1 c1 c0 c0 c0 c0 */
- coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
- __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
- coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
-
- __m128i src8, src16, mul_hi, mul_lo, t;
-
- ITERATION(src_data[0] + start, accum0);
- ITERATION(src_data[1] + start, accum1);
- ITERATION(src_data[2] + start, accum2);
- ITERATION(src_data[3] + start, accum3);
+ int remainder_offset = (filter_offset + filter_length - r) * 4;
+ accum_remainder(src_data[0] + remainder_offset, filter_values, accum0, r);
+ accum_remainder(src_data[1] + remainder_offset, filter_values, accum1, r);
+ accum_remainder(src_data[2] + remainder_offset, filter_values, accum2, r);
+ accum_remainder(src_data[3] + remainder_offset, filter_values, accum3, r);
}
accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
@@ -487,14 +448,3 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
out_row);
}
}
-
-void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
- // Padding |paddingCount| of more dummy coefficients after the coefficients
- // of last filter to prevent SIMD instructions which load 8 or 16 bytes
- // together to access invalid memory areas. We are not trying to align the
- // coefficients right now due to the opaqueness of <vector> implementation.
- // This has to be done after all |AddFilter| calls.
- for (int i = 0; i < 8; ++i) {
- filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
- }
-}
« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkBitmapProcState_arm_neon.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698