Index: src/opts/SkBitmapFilter_opts_SSE2.cpp |
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..1884f67f52f7f2f171c1ff94c979947d3e7c0771 |
--- /dev/null |
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp |
@@ -0,0 +1,189 @@ |
+/* |
+ * Copyright 2013 Google Inc. |
+ * |
+ * Use of this source code is governed by a BSD-style license that can be |
+ * found in the LICENSE file. |
+ */ |
+ |
+#include "SkBitmapProcState.h" |
+#include "SkBitmap.h" |
+#include "SkColor.h" |
+#include "SkColorPriv.h" |
+#include "SkUnPreMultiply.h" |
+#include "SkShader.h" |
+ |
+#include <emmintrin.h> |
+ |
+#if 0 |
+static inline void print128i(__m128i value) { |
+ int *v = (int*) &value; |
+ printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); |
+} |
+ |
+static inline void print128i_16(__m128i value) { |
+ short *v = (short*) &value; |
+ printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); |
+} |
+ |
+static inline void print128i_8(__m128i value) { |
+ unsigned char *v = (unsigned char*) &value; |
+ printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n", |
+ v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], |
+ v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] |
+ ); |
+} |
+ |
+static inline void print128f(__m128 value) { |
+ float *f = (float*) &value; |
+ printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); |
+} |
+#endif |
+ |
+// because the border is handled specially, this is guaranteed to have all 16 pixels |
+// available to it without running off the bitmap's edge. |
+ |
+int debug_x = 20; |
+int debug_y = 255; |
+ |
+void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, |
+ SkPMColor* SK_RESTRICT colors, int count) { |
+ |
+ SkPMColor *orig_colors = colors; |
+ |
+ const int maxX = s.fBitmap->width() - 1; |
+ const int maxY = s.fBitmap->height() - 1; |
+ |
+ while (count-- > 0) { |
+ SkPoint srcPt; |
+ s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), |
+ SkIntToScalar(y), &srcPt); |
+ srcPt.fX -= SK_ScalarHalf; |
+ srcPt.fY -= SK_ScalarHalf; |
+ SkScalar fractx = srcPt.fX - SkScalarFloorToScalar(srcPt.fX); |
+ SkScalar fracty = srcPt.fY - SkScalarFloorToScalar(srcPt.fY); |
+ |
+ int sx = SkScalarFloorToInt(srcPt.fX); |
+ int sy = SkScalarFloorToInt(srcPt.fY); |
+ |
+ __m128 weight = _mm_setzero_ps(); |
+ __m128 accum = _mm_setzero_ps(); |
+ |
+ int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f))); |
+ int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f))); |
+ int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f))); |
+ int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f))); |
+ |
+ for (int src_y = y0; src_y <= y1; src_y++) { |
+ float yweight = s.getBitmapFilter()->lookupFloat( (srcPt.fY - src_y) ); |
+ |
+ for (int src_x = x0; src_x <= x1 ; src_x++) { |
+ float xweight = s.getBitmapFilter()->lookupFloat( (srcPt.fX - src_x) ); |
+ |
+ float combined_weight = xweight * yweight; |
+ |
+ SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y); |
+ |
+ __m128i c = _mm_cvtsi32_si128( color ); |
+ c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); |
+ c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); |
+ |
+ __m128 cfloat = _mm_cvtepi32_ps( c ); |
+ |
+ __m128 weightVector = _mm_set1_ps(combined_weight); |
+ |
+ accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); |
+ weight = _mm_add_ps( weight, weightVector ); |
+ } |
+ } |
+ |
+ accum = _mm_div_ps(accum, weight); |
+ accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); |
+ |
+ __m128i accumInt = _mm_cvtps_epi32( accum ); |
+ |
+ int localResult[4]; |
+ _mm_storeu_si128((__m128i *) (localResult), accumInt); |
+ int a = SkClampMax(localResult[0], 255); |
+ int r = SkClampMax(localResult[1], a); |
+ int g = SkClampMax(localResult[2], a); |
+ int b = SkClampMax(localResult[3], a); |
+ |
+ *colors++ = SkPackARGB32(a, r, g, b); |
+ |
+ x++; |
+ } |
+} |
+ |
+void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y, |
+ SkPMColor *SK_RESTRICT colors, int count) { |
+ SkPMColor *orig_colors = colors; |
+ |
+ const int maxX = s.fBitmap->width() - 1; |
+ const int maxY = s.fBitmap->height() - 1; |
+ |
+ SkPoint srcPt; |
+ s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), |
+ SkIntToScalar(y), &srcPt); |
+ srcPt.fY -= SK_ScalarHalf; |
+ int sy = SkScalarFloorToInt(srcPt.fY); |
+ |
+ SkScalar fracty = srcPt.fY - SkScalarFloorToScalar(srcPt.fY); |
+ |
+ int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f))); |
+ int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f))); |
+ |
+ while (count-- > 0) { |
+ srcPt.fX -= SK_ScalarHalf; |
+ srcPt.fY -= SK_ScalarHalf; |
+ SkScalar fractx = srcPt.fX - SkScalarFloorToScalar(srcPt.fX); |
+ |
+ int sx = SkScalarFloorToInt(srcPt.fX); |
+ |
+ float weight = 0; |
+ __m128 accum = _mm_setzero_ps(); |
+ |
+ int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f))); |
+ int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f))); |
+ |
+ for (int src_y = y0; src_y <= y1; src_y++) { |
+ float yweight = s.getBitmapFilter()->lookupFloat( (srcPt.fY - src_y) ); |
+ |
+ for (int src_x = x0; src_x <= x1 ; src_x++) { |
+ float xweight = s.getBitmapFilter()->lookupFloat( (srcPt.fX - src_x) ); |
+ |
+ float combined_weight = xweight * yweight; |
+ |
+ SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y); |
+ |
+ __m128 c = _mm_set_ps(SkGetPackedB32(color), |
+ SkGetPackedG32(color), |
+ SkGetPackedR32(color), |
+ SkGetPackedA32(color)); |
+ |
+ __m128 weightVector = _mm_set1_ps(combined_weight); |
+ |
+ accum = _mm_add_ps(accum, _mm_mul_ps(c, weightVector)); |
+ weight += combined_weight; |
+ } |
+ } |
+ |
+ __m128 totalWeightVector = _mm_set1_ps(weight); |
+ accum = _mm_div_ps(accum, totalWeightVector); |
+ accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); |
+ |
+ float localResult[4]; |
+ _mm_storeu_ps(localResult, accum); |
+ int a = SkClampMax(int(localResult[0]), 255); |
+ int r = SkClampMax(int(localResult[1]), a); |
+ int g = SkClampMax(int(localResult[2]), a); |
+ int b = SkClampMax(int(localResult[3]), a); |
+ |
+ *colors++ = SkPackARGB32(a, r, g, b); |
+ |
+ x++; |
+ |
+ s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), |
+ SkIntToScalar(y), &srcPt); |
+ |
+ } |
+} |