Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(859)

Unified Diff: src/opts/SkBitmapFilter_opts.h

Issue 2526733002: Add AVX2 version of ConvolveVertically (Closed)
Patch Set: Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkOpts_hsw.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/opts/SkBitmapFilter_opts.h
diff --git a/src/opts/SkBitmapFilter_opts.h b/src/opts/SkBitmapFilter_opts.h
index f22b5c2368050681f6979fe08dfaeeec8405c6ca..94d97f74a80de6f1251a78d47010a6ec34a0c514 100644
--- a/src/opts/SkBitmapFilter_opts.h
+++ b/src/opts/SkBitmapFilter_opts.h
@@ -10,7 +10,9 @@
#include "SkConvolver.h"
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+ #include <immintrin.h>
mtklein_C 2016/12/05 18:42:46 I think we can just use immintrin.h for everything
xiangze.zhang 2016/12/07 11:17:01 Done.
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include <emmintrin.h>
#elif defined(SK_ARM_HAS_NEON)
#include <arm_neon.h>
@@ -18,7 +20,160 @@
namespace SK_OPTS_NS {
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+
+ static SK_ALWAYS_INLINE
+ void ComputeCoefficientRow(SkConvolutionFilter1D::ConvolutionFixed filterValue, const unsigned char* sourceDataRows,
mtklein_C 2016/12/05 18:42:45 Typically we name static functions with lower_case
xiangze.zhang 2016/12/07 11:17:01 Done.
+ __m256i& Y01, __m256i& Y23, __m256i& Y45, __m256i& Y67) {
+ __m256i coefs = _mm256_set1_epi16(filterValue);
+ __m256i pixels = _mm256_lddqu_si256(reinterpret_cast<const __m256i *>(sourceDataRows));
mtklein_C 2016/12/05 18:42:45 I have always shied away from lddqu, instead just
xiangze.zhang 2016/12/07 11:17:01 Done.
+ __m256i zero = _mm256_setzero_si256();
+
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ __m256i Yt1 = _mm256_unpacklo_epi8(pixels, zero);
mtklein_C 2016/12/05 18:42:45 Does Yt1 mean ymm, temporary #1? We usually find
xiangze.zhang 2016/12/07 11:17:01 Done.
+ // [32] cja3 cjb3 cjg3 cjr3 cja2 cjb2 cjg2 cjr2
+ __m256i Yt2 = _mm256_mulhi_epi16(Yt1, coefs);
+ // [32] cja1 cjb1 cjg1 cjr1 cja0 cjb0 cjg0 cjr0
+ Yt1 = _mm256_mullo_epi16(Yt1, coefs);
+
+ // pixel 0 and 1
+ __m256i Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2);
+ Y01 = _mm256_add_epi32(Y01, Yt3);
+ // pixel 2 and 3
+ Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2);
+ Y23 = _mm256_add_epi32(Y23, Yt3);
+
+ // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4
+ Yt1 = _mm256_unpackhi_epi8(pixels, zero);
+ // [32] cja7 cjb7 cjg7 cjr7 cja6 cjb6 cjg6 cjr6
+ Yt2 = _mm256_mulhi_epi16(Yt1, coefs);
+ // [32] cja5 cjb5 cjg5 cjr5 cja4 cjb4 cjg4 cjr4
+ Yt1 = _mm256_mullo_epi16(Yt1, coefs);
+
+ // pixel 4 and 5
+ Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2);
+ Y45 = _mm256_add_epi32(Y45, Yt3);
+ // pixel 6 and 7
+ Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2);
+ Y67 = _mm256_add_epi32(Y67, Yt3);
+ }
+
+ template<bool hasAlpha>
+ void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+ int filterLength,
+ unsigned char* const * sourceDataRows,
+ int pixelWidth,
+ unsigned char* outRow) {
+
+ int outX, filterY;
+ int width = pixelWidth & ~7;
+ int length = filterLength & ~3;
+
+ __m256i Yt0, Yt1, Yt2;
+
+ // Output eight pixels per iteration (32 bytes).
+ for (outX = 0; outX < width; outX += 8) {
+ // Accumulated result for each pixel. 32 bits per RGBA channel.
+ __m256i Y01 = _mm256_setzero_si256();
+ __m256i Y23 = _mm256_setzero_si256();
+ __m256i Y45 = _mm256_setzero_si256();
+ __m256i Y67 = _mm256_setzero_si256();
+
+ // Convolve with 4 filter coefficient per iteration.
+ for (int filterY = 0; filterY < length; filterY += 4) {
+ ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filterY] + outX * 4, Y01, Y23, Y45, Y67);
mtklein_C 2016/12/05 18:42:46 This might line up neater with a couple "+ 0".
xiangze.zhang 2016/12/07 11:17:01 Done.
+ ComputeCoefficientRow(filterValues[filterY + 1], sourceDataRows[filterY + 1] + outX * 4, Y01, Y23, Y45, Y67);
+ ComputeCoefficientRow(filterValues[filterY + 2], sourceDataRows[filterY + 2] + outX * 4, Y01, Y23, Y45, Y67);
+ ComputeCoefficientRow(filterValues[filterY + 3], sourceDataRows[filterY + 3] + outX * 4, Y01, Y23, Y45, Y67);
+ }
+ for (filterY = length; filterY < filterLength; filterY++) {
+ ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filterY] + outX * 4, Y01, Y23, Y45, Y67);
+ }
+
+ // Shift right for fixed point implementation.
+ Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits);
+ Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits);
+ Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits);
+ Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits);
+
+ // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ Yt0 = _mm256_packs_epi32(Y01, Y23);
+
+ // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+ // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4
+ Yt1 = _mm256_packs_epi32(Y45, Y67);
+
+ // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+ // [8] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4 a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ Yt0 = _mm256_packus_epi16(Yt0, Yt1);
+
+ if (hasAlpha) {
+ // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+ Yt1 = _mm256_srli_epi32(Yt0, 8);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g.
+ // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+ Yt1 = _mm256_srli_epi32(Yt0, 16);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b.
+ // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+ Yt2 = _mm256_slli_epi32(Yt2, 24);
+
+ // Make sure the value of alpha channel is always larger than maximum
mtklein_C 2016/12/05 18:42:46 This comment might be better moved up just under i
xiangze.zhang 2016/12/07 11:17:01 Done.
+ // value of color channels.
+ Yt0 = _mm256_max_epu8(Yt2, Yt0);
+ } else {
+ __m256i mask = _mm256_set1_epi32(0xff000000);
+ Yt0 = _mm256_or_si256(Yt0, mask);
+ }
+
+ // Store the convolution result (32 bytes) and advance the pixel pointers.
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(outRow), Yt0);
+ outRow += 32;
+ }
+
+ if (pixelWidth & 7) {
+ __m256i Y01 = _mm256_setzero_si256();
+ __m256i Y23 = _mm256_setzero_si256();
+ __m256i Y45 = _mm256_setzero_si256();
+ __m256i Y67 = _mm256_setzero_si256();
+
+ for (int filterY = 0; filterY < filterLength; filterY++) {
+ ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filterY] + outX * 4, Y01, Y23, Y45, Y67);
+ }
+
+ Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits);
+ Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits);
+ Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits);
+ Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits);
+
+ Yt0 = _mm256_packs_epi32(Y01, Y23);
+ Yt1 = _mm256_packs_epi32(Y45, Y67);
+ Yt0 = _mm256_packus_epi16(Yt0, Yt1);
+
+ if (hasAlpha) {
+ Yt1 = _mm256_srli_epi32(Yt0, 8);
+ Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g.
mtklein_C 2016/12/05 18:42:45 This is the sort of code that using more variables
xiangze.zhang 2016/12/07 11:17:01 Done.
+ Yt1 = _mm256_srli_epi32(Yt0, 16);
+ Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b.
+ Yt2 = _mm256_slli_epi32(Yt2, 24);
+ Yt0 = _mm256_max_epu8(Yt2, Yt0);
+ } else {
+ __m256i mask = _mm256_set1_epi32(0xff000000);
+ Yt0 = _mm256_or_si256(Yt0, mask);
+ }
+
+ for (int i = width; i < pixelWidth; i++) {
mtklein_C 2016/12/05 18:42:46 I'd have expected a call to _mm256_maskstore_epi32
xiangze.zhang 2016/12/07 11:17:01 I tried something like this: __m256i mask = _m
+ *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(Yt0));
+ __m256i rotate = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ Yt0 = _mm256_permutevar8x32_epi32(Yt0, rotate);
+ outRow += 4;
+ }
+ }
+ }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum, int r) {
« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkOpts_hsw.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698