| Index: src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp
|
| diff --git a/src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp b/src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..53ce2ff8cc8e43de80b23e2bd24827eff6e4cb8d
|
| --- /dev/null
|
| +++ b/src/opts/SkMatrixConvolutionImageFilter_opts_AVX2.cpp
|
| @@ -0,0 +1,112 @@
|
| +/*
|
| + * Copyright 2016 Google Inc.
|
| + *
|
| + * Use of this source code is governed by a BSD-style license that can be
|
| + * found in the LICENSE file.
|
| + */
|
| +
|
| +#include "SkColor.h"
|
| +#include "SkBitmap.h"
|
| +#include "SkPoint.h"
|
| +#include "SkColorPriv.h"
|
| +
|
| +#include <immintrin.h>
|
| +
|
| +
|
| +namespace sk_avx2 {
|
| +
|
| +// Convolves src with kernel over (r intersected with bounds), writing SkPMColor to result.
|
| +// result is addressed relative to the top-left corner of bounds.
|
| +// src is read without clamping at (x + cx - kernelOffset.fX, y + cy - kernelOffset.fY), so
|
| +// the caller must keep the kernel footprint of every output pixel inside src. kernel is
|
| +// row-major with kernelSize.fWidth entries per row; each channel sum is multiplied by
|
| +// gain, offset by bias and floored. With convolveAlpha false, alpha comes from src(x, y)
|
| +// and the color is premultiplied by it; otherwise color is clamped to the convolved alpha.
|
| +void SkMatrixConvolutionImageFilter_filterPixels_AVX2(
|
| +    const SkBitmap& src,
|
| +    SkBitmap* result,
|
| +    const SkIRect& r,
|
| +    const SkIRect& bounds,
|
| +    bool convolveAlpha,
|
| +    SkScalar* kernel,
|
| +    const SkISize& kernelSize,
|
| +    const SkIPoint& kernelOffset,
|
| +    SkScalar gain,
|
| +    SkScalar bias)
|
| +{
|
| +    SkIRect rect(r);
|
| +    if (!rect.intersect(bounds)) {
|
| +        return;
|
| +    }
|
| +
|
| +    // Kernel columns are consumed in pairs; an odd width leaves one tail column.
|
| +    const int width_even = 2 * (kernelSize.fWidth / 2);
|
| +
|
| +    for (int y = rect.fTop; y < rect.fBottom; ++y) {
|
| +        SkPMColor* dptr = result->getAddr32(rect.fLeft - bounds.fLeft, y - bounds.fTop);
|
| +        for (int x = rect.fLeft; x < rect.fRight; ++x) {
|
| +            // Lanes 0-3 accumulate bytes 0-3 of even columns, lanes 4-7 those of odd columns.
|
| +            __m256 psum = _mm256_setzero_ps();
|
| +            for (int cy = 0; cy < kernelSize.fHeight; cy++) {
|
| +                const SkScalar* krow = kernel + cy * kernelSize.fWidth;
|
| +                const int sy = y + cy - kernelOffset.fY;
|
| +                int cx;
|
| +                for (cx = 0; cx < width_even; cx += 2) {
|
| +                    const int sx = x + cx - kernelOffset.fX;
|
| +                    const SkPMColor s1 = *src.getAddr32(sx, sy);
|
| +                    const SkPMColor s2 = *src.getAddr32(sx + 1, sy);
|
| +                    // Zero-extend the 8 channel bytes of both pixels to 8 int32 lanes. A byte
|
| +                    // shuffle would need 0x80 mask bytes, which do not fit in a char argument.
|
| +                    const __m256i ps = _mm256_cvtepu8_epi32(
|
| +                            _mm_set_epi32(0, 0, static_cast<int>(s2), static_cast<int>(s1)));
|
| +                    const __m256 pss = _mm256_cvtepi32_ps(ps);
|
| +
|
| +                    const SkScalar k1 = krow[cx];
|
| +                    const SkScalar k2 = krow[cx + 1];
|
| +                    const __m256 pk = _mm256_set_ps(k2, k2, k2, k2, k1, k1, k1, k1);
|
| +
|
| +                    psum = _mm256_add_ps(psum, _mm256_mul_ps(pss, pk));
|
| +                }
|
| +                // Odd kernel width: the last column of the row is handled on its own.
|
| +                if (cx < kernelSize.fWidth) {
|
| +                    const SkPMColor s1 = *src.getAddr32(x + cx - kernelOffset.fX, sy);
|
| +                    // Only the low 4 bytes are populated, so lanes 4-7 of ps are zero.
|
| +                    const __m256i ps = _mm256_cvtepu8_epi32(
|
| +                            _mm_cvtsi32_si128(static_cast<int>(s1)));
|
| +                    const __m256 pss = _mm256_cvtepi32_ps(ps);
|
| +
|
| +                    psum = _mm256_add_ps(psum, _mm256_mul_ps(pss, _mm256_set1_ps(krow[cx])));
|
| +                }
|
| +            }
|
| +
|
| +            // Add the even-column and odd-column halves; sums[i] is the total for byte i
|
| +            // of SkPMColor. Stored with an intrinsic rather than read through a union.
|
| +            float sums[4];
|
| +            _mm_storeu_ps(sums, _mm_add_ps(_mm256_castps256_ps128(psum),
|
| +                                           _mm256_extractf128_ps(psum, 1)));
|
| +
|
| +            // Pick channels by the configured shifts instead of assuming byte 0 is blue,
|
| +            // so red and blue are not swapped when SkPMColor is laid out as RGBA.
|
| +            const SkScalar sumA = sums[SK_A32_SHIFT / 8];
|
| +            const SkScalar sumR = sums[SK_R32_SHIFT / 8];
|
| +            const SkScalar sumG = sums[SK_G32_SHIFT / 8];
|
| +            const SkScalar sumB = sums[SK_B32_SHIFT / 8];
|
| +
|
| +            int a = convolveAlpha
|
| +                  ? SkClampMax(SkScalarFloorToInt(SkScalarMul(sumA, gain) + bias), 255)
|
| +                  : 255;
|
| +            // Color channels are clamped to at most a (255 when alpha is not convolved).
|
| +            int red = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumR, gain) + bias), a);
|
| +            int green = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumG, gain) + bias), a);
|
| +            int blue = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumB, gain) + bias), a);
|
| +            if (!convolveAlpha) {
|
| +                a = SkGetPackedA32(*src.getAddr32(x, y));
|
| +                *dptr++ = SkPreMultiplyARGB(a, red, green, blue);
|
| +            } else {
|
| +                *dptr++ = SkPackARGB32(a, red, green, blue);
|
| +            }
|
| +        }
|
| +    }
|
| +}
|
| +
|
| +}  // namespace sk_avx2
|
|
|