| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
| 9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
| 10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" |
| (...skipping 28 matching lines...) Expand all Loading... |
| 39 printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); | 39 printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); |
| 40 } | 40 } |
| 41 #endif | 41 #endif |
| 42 | 42 |
| 43 // because the border is handled specially, this is guaranteed to have all 16 pixels | 43 // because the border is handled specially, this is guaranteed to have all 16 pixels |
| 44 // available to it without running off the bitmap's edge. | 44 // available to it without running off the bitmap's edge. |
| 45 | 45 |
| 46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, | 46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, |
| 47 SkPMColor* SK_RESTRICT colors, int count) { | 47 SkPMColor* SK_RESTRICT colors, int count) { |
| 48 | 48 |
| 49 const int maxX = s.fBitmap->width() - 1; | 49 const int maxX = s.fBitmap->width(); |
| 50 const int maxY = s.fBitmap->height() - 1; | 50 const int maxY = s.fBitmap->height(); |
| 51 SkAutoTMalloc<SkScalar> xWeights(maxX); |
| 51 | 52 |
| 52 while (count-- > 0) { | 53 while (count-- > 0) { |
| 53 SkPoint srcPt; | 54 SkPoint srcPt; |
| 54 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), | 55 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt); |
| 55 SkIntToScalar(y), &srcPt); | |
| 56 srcPt.fX -= SK_ScalarHalf; | 56 srcPt.fX -= SK_ScalarHalf; |
| 57 srcPt.fY -= SK_ScalarHalf; | 57 srcPt.fY -= SK_ScalarHalf; |
| 58 | 58 |
| 59 int sx = SkScalarFloorToInt(srcPt.fX); | |
| 60 int sy = SkScalarFloorToInt(srcPt.fY); | |
| 61 | |
| 62 __m128 weight = _mm_setzero_ps(); | 59 __m128 weight = _mm_setzero_ps(); |
| 63 __m128 accum = _mm_setzero_ps(); | 60 __m128 accum = _mm_setzero_ps(); |
| 64 | 61 |
| 65 int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f))); | 62 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->widt
h()), maxY); |
| 66 int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f))
); | 63 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->wid
th()+1), maxY); |
| 67 int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f))); | 64 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->widt
h()), maxX); |
| 68 int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f))
); | 65 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->wid
th())+1, maxX); |
| 69 | 66 |
| 70 for (int src_y = y0; src_y <= y1; src_y++) { | 67 for (int srcX = x0; srcX < x1 ; srcX++) { |
| 71 float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(sr
cPt.fY - src_y)); | 68 // Looking these up once instead of each loop is a ~15% speedup. |
| 69 xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX -
srcX)); |
| 70 } |
| 72 | 71 |
| 73 for (int src_x = x0; src_x <= x1 ; src_x++) { | 72 for (int srcY = y0; srcY < y1; srcY++) { |
| 74 float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScala
r(srcPt.fX - src_x)); | 73 SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - src
Y)); |
| 75 | 74 |
| 76 float combined_weight = xweight * yweight; | 75 for (int srcX = x0; srcX < x1 ; srcX++) { |
| 76 SkScalar xWeight = xWeights[srcX - x0]; |
| 77 | 77 |
| 78 SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y); | 78 SkScalar combined_weight = SkScalarMul(xWeight, yWeight); |
| 79 | 79 |
| 80 __m128i c = _mm_cvtsi32_si128( color ); | 80 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); |
| 81 |
| 82 __m128i c = _mm_cvtsi32_si128(color); |
| 81 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); | 83 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); |
| 82 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); | 84 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); |
| 83 | 85 __m128 cfloat = _mm_cvtepi32_ps(c); |
| 84 __m128 cfloat = _mm_cvtepi32_ps( c ); | |
| 85 | 86 |
| 86 __m128 weightVector = _mm_set1_ps(combined_weight); | 87 __m128 weightVector = _mm_set1_ps(combined_weight); |
| 87 | |
| 88 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); | 88 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); |
| 89 weight = _mm_add_ps( weight, weightVector ); | 89 weight = _mm_add_ps( weight, weightVector ); |
| 90 } | 90 } |
| 91 } | 91 } |
| 92 | 92 |
| 93 accum = _mm_div_ps(accum, weight); | 93 accum = _mm_div_ps(accum, weight); |
| 94 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); | 94 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); |
| 95 __m128i accumInt = _mm_cvttps_epi32(accum); |
| 96 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128()); |
| 97 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128()); |
| 98 SkPMColor c = _mm_cvtsi128_si32(accumInt); |
| 95 | 99 |
| 96 __m128i accumInt = _mm_cvtps_epi32( accum ); | 100 int a = SkClampMax(SkGetPackedA32(c), 255); |
| 97 | 101 int r = SkClampMax(SkGetPackedR32(c), a); |
| 98 int localResult[4]; | 102 int g = SkClampMax(SkGetPackedG32(c), a); |
| 99 _mm_storeu_si128((__m128i *) (localResult), accumInt); | 103 int b = SkClampMax(SkGetPackedB32(c), a); |
| 100 int a = SkClampMax(localResult[0], 255); | |
| 101 int r = SkClampMax(localResult[1], a); | |
| 102 int g = SkClampMax(localResult[2], a); | |
| 103 int b = SkClampMax(localResult[3], a); | |
| 104 | 104 |
| 105 *colors++ = SkPackARGB32(a, r, g, b); | 105 *colors++ = SkPackARGB32(a, r, g, b); |
| 106 | 106 |
| 107 x++; | 107 x++; |
| 108 } | 108 } |
| 109 } | 109 } |
| 110 | 110 |
| 111 // Convolves horizontally along a single row. The row data is given in | 111 // Convolves horizontally along a single row. The row data is given in |
| 112 // |src_data| and continues for the num_values() of the filter. | 112 // |src_data| and continues for the num_values() of the filter. |
| 113 void convolveHorizontally_SSE2(const unsigned char* src_data, | 113 void convolveHorizontally_SSE2(const unsigned char* src_data, |
| (...skipping 439 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
| 554 // Padding |paddingCount| of more dummy coefficients after the coefficients | 554 // Padding |paddingCount| of more dummy coefficients after the coefficients |
| 555 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 555 // of last filter to prevent SIMD instructions which load 8 or 16 bytes |
| 556 // together to access invalid memory areas. We are not trying to align the | 556 // together to access invalid memory areas. We are not trying to align the |
| 557 // coefficients right now due to the opaqueness of <vector> implementation. | 557 // coefficients right now due to the opaqueness of <vector> implementation. |
| 558 // This has to be done after all |AddFilter| calls. | 558 // This has to be done after all |AddFilter| calls. |
| 559 for (int i = 0; i < 8; ++i) { | 559 for (int i = 0; i < 8; ++i) { |
| 560 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | 560 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); |
| 561 } | 561 } |
| 562 } | 562 } |
| OLD | NEW |