OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2016 Google Inc. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. |
| 6 */ |
| 7 |
| 8 #include "SkColor.h" |
| 9 #include "SkBitmap.h" |
| 10 #include "SkPoint.h" |
| 11 #include "SkColorPriv.h" |
| 12 |
| 13 #include <immintrin.h> |
| 14 |
| 15 |
| 16 namespace sk_avx2 { |
| 17 |
| 18 void SkMatrixConvolutionImageFilter_filterPixels_AVX2( |
| 19 const SkBitmap& src, |
| 20 SkBitmap* result, |
| 21 const SkIRect& r, |
| 22 const SkIRect& bounds, |
| 23 bool convolveAlpha, |
| 24 SkScalar* kernel, |
| 25 const SkISize& kernelSize, |
| 26 const SkIPoint& kernelOffset, |
| 27 SkScalar gain, |
| 28 SkScalar bias) |
| 29 { |
| 30 SkIRect rect(r); |
| 31 if (!rect.intersect(bounds)) { |
| 32 return; |
| 33 } |
| 34 |
| 35 __m256i mask = _mm256_set_epi8(128,128,128,3, 128,128,128,2, 128,128,128,1,
128,128,128,0, |
| 36 128,128,128,3, 128,128,128,2, 128,128,128,1,
128,128,128,0); |
| 37 __m256i mask_odd = _mm256_set_epi8(128,128,128,128, 128,128,128,128, 128,128
,128,128, 128,128,128,128, |
| 38 128,128,128,3, 128,128,128,2, 128,128,128
,1, 128,128,128,0); |
| 39 int width_even = 2*(kernelSize.fWidth/2); |
| 40 |
| 41 for (int y = rect.fTop; y < rect.fBottom; ++y) { |
| 42 SkPMColor* dptr = result->getAddr32(rect.fLeft - bounds.fLeft, y - bound
s.fTop); |
| 43 for (int x = rect.fLeft; x < rect.fRight; ++x) { |
| 44 __m256 psum = _mm256_setzero_ps(); |
| 45 for (int cy = 0; cy < kernelSize.fHeight; cy++) { |
| 46 int cx; |
| 47 for (cx = 0; cx < width_even ; cx+=2) { |
| 48 SkPMColor s1 = *src.getAddr32( |
| 49 x + cx - kernelOffset.fX, |
| 50 y + cy - kernelOffset.fY); |
| 51 SkPMColor s2 = *src.getAddr32( |
| 52 x + cx - kernelOffset.fX + 1, |
| 53 y + cy - kernelOffset.fY); |
| 54 __m256i ps = _mm256_set_epi32(0, 0, 0, s2, 0, 0, 0, s1); |
| 55 __m256i ps_sh = _mm256_shuffle_epi8(ps, mask); |
| 56 __m256 pss = _mm256_cvtepi32_ps(ps_sh); |
| 57 |
| 58 SkScalar k1 = kernel[cy * kernelSize.fWidth + cx]; |
| 59 SkScalar k2 = kernel[cy * kernelSize.fWidth + cx + 1]; |
| 60 __m256 pk = _mm256_set_ps(k2,k2,k2,k2, k1,k1,k1,k1); |
| 61 |
| 62 __m256 pmul = _mm256_mul_ps(pss, pk); |
| 63 |
| 64 psum = _mm256_add_ps(psum, pmul); |
| 65 } |
| 66 if (cx < kernelSize.fWidth) { |
| 67 SkPMColor s1 = *src.getAddr32( |
| 68 x + cx - kernelOffset.fX, |
| 69 y + cy - kernelOffset.fY); |
| 70 |
| 71 __m256i ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s1); |
| 72 __m256i ps_sh = _mm256_shuffle_epi8(ps, mask_odd); |
| 73 |
| 74 SkScalar k = kernel[cy * kernelSize.fWidth + cx]; |
| 75 __m256 pk = _mm256_set1_ps(k); |
| 76 |
| 77 __m256 pss = _mm256_cvtepi32_ps(ps_sh); |
| 78 __m256 pmul = _mm256_mul_ps(pss, pk); |
| 79 |
| 80 psum = _mm256_add_ps(psum, pmul); |
| 81 } |
| 82 } |
| 83 |
| 84 union { |
| 85 __m256 m256; |
| 86 float f[8]; |
| 87 } conv = {psum}; |
| 88 |
| 89 SkScalar sumA, sumR, sumG, sumB; |
| 90 |
| 91 sumA = conv.f[3] + conv.f[7]; |
| 92 sumR = conv.f[2] + conv.f[6]; |
| 93 sumG = conv.f[1] + conv.f[5]; |
| 94 sumB = conv.f[0] + conv.f[4]; |
| 95 |
| 96 int a = convolveAlpha |
| 97 ? SkClampMax(SkScalarFloorToInt(SkScalarMul(sumA, gain) + bias),
255) |
| 98 : 255; |
| 99 int r = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumR, gain) + bias
), a); |
| 100 int g = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumG, gain) + bias
), a); |
| 101 int b = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumB, gain) + bias
), a); |
| 102 if (!convolveAlpha) { |
| 103 a = SkGetPackedA32(*src.getAddr32(x, y)); |
| 104 *dptr++ = SkPreMultiplyARGB(a, r, g, b); |
| 105 } else { |
| 106 *dptr++ = SkPackARGB32(a, r, g, b); |
| 107 } |
| 108 } |
| 109 } |
| 110 } |
| 111 |
| 112 } |
OLD | NEW |