Index: src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp
diff --git a/src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp b/src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..53ce2ff8cc8e43de80b23e2bd24827eff6e4cb8d
--- /dev/null
+++ b/src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBitmap.h"
+#include "SkColor.h"
+#include "SkColorPriv.h"
+#include "SkPoint.h"
+
+#include <immintrin.h>
+
+
+namespace sk_avx2 {
+
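+// AVX2 convolution of the pixels of 'src' inside 'rect', written to 'result'.
+// Two kernel taps (and two source pixels) are processed per iteration, one in
+// each 128-bit lane of a 256-bit register. Source reads are not clamped, so
+// 'rect' is assumed to lie far enough inside the source that every tap is valid.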
+void SkMatrixConvolutionImageFilter_filterPixels_AVX2(
+    const SkBitmap& src,
+    SkBitmap* result,
+    const SkIRect& r,
+    const SkIRect& bounds,
+    bool convolveAlpha,
+    SkScalar* kernel,
+    const SkISize& kernelSize,
+    const SkIPoint& kernelOffset,
+    SkScalar gain,
+    SkScalar bias)
+{
+    SkIRect rect(r);
+    if (!rect.intersect(bounds)) {
+        return;
+    }
+
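+    // Shuffle controls that zero-extend the four 8-bit channels of a packed
+    // pixel into four 32-bit lanes. Control bytes with the high bit set
+    // (128 == 0x80) select zero; 'mask_odd' additionally zeroes the upper lane.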
+    __m256i mask = _mm256_set_epi8(128,128,128,3, 128,128,128,2, 128,128,128,1, 128,128,128,0,
+                                   128,128,128,3, 128,128,128,2, 128,128,128,1, 128,128,128,0);
+    __m256i mask_odd = _mm256_set_epi8(128,128,128,128, 128,128,128,128, 128,128,128,128, 128,128,128,128,
+                                       128,128,128,3, 128,128,128,2, 128,128,128,1, 128,128,128,0);
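+    // Largest even number of taps in a kernel row: pairs of taps are consumed
+    // below, and any trailing odd tap is handled separately.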
+    int width_even = 2 * (kernelSize.fWidth / 2);
+
+    for (int y = rect.fTop; y < rect.fBottom; ++y) {
+        SkPMColor* dptr = result->getAddr32(rect.fLeft - bounds.fLeft, y - bounds.fTop);
+        for (int x = rect.fLeft; x < rect.fRight; ++x) {
+            __m256 psum = _mm256_setzero_ps();
+            for (int cy = 0; cy < kernelSize.fHeight; cy++) {
+                int cx;
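+                // Two adjacent source pixels per iteration: s1 in the low
+                // 128-bit lane, s2 in the high lane, each with its own tap.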
+                for (cx = 0; cx < width_even; cx += 2) {
+                    SkPMColor s1 = *src.getAddr32(
+                            x + cx - kernelOffset.fX,
+                            y + cy - kernelOffset.fY);
+                    SkPMColor s2 = *src.getAddr32(
+                            x + cx - kernelOffset.fX + 1,
+                            y + cy - kernelOffset.fY);
+                    __m256i ps = _mm256_set_epi32(0, 0, 0, s2, 0, 0, 0, s1);
+                    __m256i ps_sh = _mm256_shuffle_epi8(ps, mask);
+                    __m256 pss = _mm256_cvtepi32_ps(ps_sh);
+
+                    SkScalar k1 = kernel[cy * kernelSize.fWidth + cx];
+                    SkScalar k2 = kernel[cy * kernelSize.fWidth + cx + 1];
+                    __m256 pk = _mm256_set_ps(k2, k2, k2, k2, k1, k1, k1, k1);
+
+                    __m256 pmul = _mm256_mul_ps(pss, pk);
+
+                    psum = _mm256_add_ps(psum, pmul);
+                }
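+                // Trailing tap of an odd-width kernel row; the high lane of
+                // the product contributes only zeros to the accumulator.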
+                if (cx < kernelSize.fWidth) {
+                    SkPMColor s1 = *src.getAddr32(
+                            x + cx - kernelOffset.fX,
+                            y + cy - kernelOffset.fY);
+
+                    __m256i ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s1);
+                    __m256i ps_sh = _mm256_shuffle_epi8(ps, mask_odd);
+
+                    SkScalar k = kernel[cy * kernelSize.fWidth + cx];
+                    __m256 pk = _mm256_set1_ps(k);
+
+                    __m256 pss = _mm256_cvtepi32_ps(ps_sh);
+                    __m256 pmul = _mm256_mul_ps(pss, pk);
+
+                    psum = _mm256_add_ps(psum, pmul);
+                }
+            }
+
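+            // Lanes 0-3 and 4-7 hold the per-channel partial sums of the two
+            // pixel streams; add them to finish the horizontal reduction.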
+            union {
+                __m256 m256;
+                float f[8];
+            } conv = {psum};
+
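+            // Index the per-byte lanes by the packed-color shifts so the sums
+            // are correct for either an RGBA or a BGRA SkPMColor layout.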
+            SkScalar sumA, sumR, sumG, sumB;
+
+            sumA = conv.f[SK_A32_SHIFT >> 3] + conv.f[(SK_A32_SHIFT >> 3) + 4];
+            sumR = conv.f[SK_R32_SHIFT >> 3] + conv.f[(SK_R32_SHIFT >> 3) + 4];
+            sumG = conv.f[SK_G32_SHIFT >> 3] + conv.f[(SK_G32_SHIFT >> 3) + 4];
+            sumB = conv.f[SK_B32_SHIFT >> 3] + conv.f[(SK_B32_SHIFT >> 3) + 4];
+
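+            // Scale by gain, add bias, and clamp. When alpha is not convolved,
+            // the source pixel's own alpha is reused and the filtered RGB is
+            // re-premultiplied against it.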
+            int a = convolveAlpha
+                ? SkClampMax(SkScalarFloorToInt(SkScalarMul(sumA, gain) + bias), 255)
+                : 255;
+            int r = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumR, gain) + bias), a);
+            int g = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumG, gain) + bias), a);
+            int b = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumB, gain) + bias), a);
+            if (!convolveAlpha) {
+                a = SkGetPackedA32(*src.getAddr32(x, y));
+                *dptr++ = SkPreMultiplyARGB(a, r, g, b);
+            } else {
+                *dptr++ = SkPackARGB32(a, r, g, b);
+            }
+        }
+    }
+}
+
+}  // namespace sk_avx2