Index: src/opts/SkMatrixConvolutionImageFilter_opts_SSE.cpp |
diff --git a/src/opts/SkMatrixConvolutionImageFilter_opts_SSE.cpp b/src/opts/SkMatrixConvolutionImageFilter_opts_SSE.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..28b166d016d6402b47ff00337b707263f64fdfa7 |
--- /dev/null |
+++ b/src/opts/SkMatrixConvolutionImageFilter_opts_SSE.cpp |
@@ -0,0 +1,76 @@ |
+/* |
+ * Copyright 2016 Google Inc. |
+ * |
+ * Use of this source code is governed by a BSD-style license that can be |
+ * found in the LICENSE file. |
+ */ |
+ |
#include "SkColor.h"
#include "SkBitmap.h"
#include "SkPoint.h"
#include "SkColorPriv.h"

#include <emmintrin.h>
+ |
+ |
+namespace sk_sse { |
+ |
+void SkMatrixConvolutionImageFilter_filterPixels_SSE(const SkBitmap& src, |
+ SkBitmap* result, |
+ const SkIRect& r, |
+ const SkIRect& bounds, |
+ bool convolveAlpha, |
+ SkScalar* kernel, |
+ const SkISize& kernelSize, |
+ const SkIPoint& kernelOffset, |
+ SkScalar gain, |
+ SkScalar bias) |
+{ |
+ SkIRect rect(r); |
+ if (!rect.intersect(bounds)) { |
+ return; |
+ } |
+ for (int y = rect.fTop; y < rect.fBottom; ++y) { |
+ SkPMColor* dptr = result->getAddr32(rect.fLeft - bounds.fLeft, y - bounds.fTop); |
+ for (int x = rect.fLeft; x < rect.fRight; ++x) { |
+ __m128 psum = _mm_setzero_ps(); |
+ for (int cy = 0; cy < kernelSize.fHeight; cy++) { |
+ for (int cx = 0; cx < kernelSize.fWidth; cx++) { |
+ SkPMColor s = *src.getAddr32( |
+ x + cx - kernelOffset.fX, |
+ y + cy - kernelOffset.fY); |
+ __m128 pss = _mm_cvtpu8_ps(_mm_set_pi32 (0, s)); |
+ |
+ __m128 pk = _mm_set1_ps(kernel[cy * kernelSize.fWidth + cx]); |
+ |
+ psum = _mm_add_ps(psum, _mm_mul_ps(pss, pk)); |
+ } |
+ } |
+ |
+ union { |
+ __m128 m128; |
+ float f[4]; |
+ } conv = {psum}; |
+ |
+ SkScalar sumA, sumR, sumG, sumB; |
+ |
+ sumA = conv.f[3]; |
+ sumR = conv.f[2]; |
+ sumG = conv.f[1]; |
+ sumB = conv.f[0]; |
+ |
+ int a = convolveAlpha |
+ ? SkClampMax(SkScalarFloorToInt(SkScalarMul(sumA, gain) + bias), 255) |
Stephen White
2016/04/12 20:01:59
Using Sk4/Sk4f might also allow us to benefit from
|
+ : 255; |
+ int r = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumR, gain) + bias), a); |
+ int g = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumG, gain) + bias), a); |
+ int b = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumB, gain) + bias), a); |
+ if (!convolveAlpha) { |
+ a = SkGetPackedA32(*src.getAddr32(x, y)); |
+ *dptr++ = SkPreMultiplyARGB(a, r, g, b); |
+ } else { |
+ *dptr++ = SkPackARGB32(a, r, g, b); |
+ } |
+ } |
+ } |
+} |
+ |
+} |