| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  * Copyright 2013 Google Inc. | 2  * Copyright 2013 Google Inc. | 
| 3  * | 3  * | 
| 4  * Use of this source code is governed by a BSD-style license that can be | 4  * Use of this source code is governed by a BSD-style license that can be | 
| 5  * found in the LICENSE file. | 5  * found in the LICENSE file. | 
| 6  */ | 6  */ | 
| 7 | 7 | 
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> | 
| 9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" | 
| 10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" | 
| (...skipping 28 matching lines...) Expand all  Loading... | 
| 39     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); | 39     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); | 
| 40 } | 40 } | 
| 41 #endif | 41 #endif | 
| 42 | 42 | 
| 43 // because the border is handled specially, this is guaranteed to have all 16 pixels | 43 // because the border is handled specially, this is guaranteed to have all 16 pixels | 
| 44 // available to it without running off the bitmap's edge. | 44 // available to it without running off the bitmap's edge. | 
| 45 | 45 | 
| 46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, | 46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, | 
| 47                             SkPMColor* SK_RESTRICT colors, int count) { | 47                             SkPMColor* SK_RESTRICT colors, int count) { | 
| 48 | 48 | 
| 49     const int maxX = s.fBitmap->width() - 1; | 49     const int maxX = s.fBitmap->width(); | 
| 50     const int maxY = s.fBitmap->height() - 1; | 50     const int maxY = s.fBitmap->height(); | 
|  | 51     SkAutoTMalloc<SkScalar> xWeights(maxX); | 
| 51 | 52 | 
| 52     while (count-- > 0) { | 53     while (count-- > 0) { | 
| 53         SkPoint srcPt; | 54         SkPoint srcPt; | 
| 54         s.fInvProc(s.fInvMatrix, SkIntToScalar(x), | 55         s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt); | 
| 55                     SkIntToScalar(y), &srcPt); |  | 
| 56         srcPt.fX -= SK_ScalarHalf; | 56         srcPt.fX -= SK_ScalarHalf; | 
| 57         srcPt.fY -= SK_ScalarHalf; | 57         srcPt.fY -= SK_ScalarHalf; | 
| 58 | 58 | 
| 59         int sx = SkScalarFloorToInt(srcPt.fX); |  | 
| 60         int sy = SkScalarFloorToInt(srcPt.fY); |  | 
| 61 |  | 
| 62         __m128 weight = _mm_setzero_ps(); | 59         __m128 weight = _mm_setzero_ps(); | 
| 63         __m128 accum = _mm_setzero_ps(); | 60         __m128 accum = _mm_setzero_ps(); | 
| 64 | 61 | 
| 65         int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f))); | 62         int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->widt
     h()), maxY); | 
| 66         int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f))
     ); | 63         int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->wid
     th()+1), maxY); | 
| 67         int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f))); | 64         int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->widt
     h()), maxX); | 
| 68         int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f))
     ); | 65         int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->wid
     th())+1, maxX); | 
| 69 | 66 | 
| 70         for (int src_y = y0; src_y <= y1; src_y++) { | 67         for (int srcX = x0; srcX < x1 ; srcX++) { | 
| 71             float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(sr
     cPt.fY - src_y)); | 68             // Looking these up once instead of each loop is a ~15% speedup. | 
|  | 69             xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - 
     srcX)); | 
|  | 70         } | 
| 72 | 71 | 
| 73             for (int src_x = x0; src_x <= x1 ; src_x++) { | 72         for (int srcY = y0; srcY < y1; srcY++) { | 
| 74                 float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScala
     r(srcPt.fX - src_x)); | 73             SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - src
     Y)); | 
| 75 | 74 | 
| 76                 float combined_weight = xweight * yweight; | 75             for (int srcX = x0; srcX < x1 ; srcX++) { | 
|  | 76                 SkScalar xWeight = xWeights[srcX - x0]; | 
| 77 | 77 | 
| 78                 SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y); | 78                 SkScalar combined_weight = SkScalarMul(xWeight, yWeight); | 
| 79 | 79 | 
| 80                 __m128i c = _mm_cvtsi32_si128( color ); | 80                 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); | 
|  | 81 | 
|  | 82                 __m128i c = _mm_cvtsi32_si128(color); | 
| 81                 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); | 83                 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); | 
| 82                 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); | 84                 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); | 
| 83 | 85                 __m128 cfloat = _mm_cvtepi32_ps(c); | 
| 84                 __m128 cfloat = _mm_cvtepi32_ps( c ); |  | 
| 85 | 86 | 
| 86                 __m128 weightVector = _mm_set1_ps(combined_weight); | 87                 __m128 weightVector = _mm_set1_ps(combined_weight); | 
| 87 |  | 
| 88                 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); | 88                 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); | 
| 89                 weight = _mm_add_ps( weight, weightVector ); | 89                 weight = _mm_add_ps( weight, weightVector ); | 
| 90             } | 90             } | 
| 91         } | 91         } | 
| 92 | 92 | 
| 93         accum = _mm_div_ps(accum, weight); | 93         accum = _mm_div_ps(accum, weight); | 
| 94         accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); | 94         accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); | 
|  | 95         __m128i accumInt = _mm_cvttps_epi32(accum); | 
|  | 96         accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128()); | 
|  | 97         accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128()); | 
|  | 98         SkPMColor c = _mm_cvtsi128_si32(accumInt); | 
| 95 | 99 | 
| 96         __m128i accumInt = _mm_cvtps_epi32( accum ); | 100         int a = SkClampMax(SkGetPackedA32(c), 255); | 
| 97 | 101         int r = SkClampMax(SkGetPackedR32(c), a); | 
| 98         int localResult[4]; | 102         int g = SkClampMax(SkGetPackedG32(c), a); | 
| 99         _mm_storeu_si128((__m128i *) (localResult), accumInt); | 103         int b = SkClampMax(SkGetPackedB32(c), a); | 
| 100         int a = SkClampMax(localResult[0], 255); |  | 
| 101         int r = SkClampMax(localResult[1], a); |  | 
| 102         int g = SkClampMax(localResult[2], a); |  | 
| 103         int b = SkClampMax(localResult[3], a); |  | 
| 104 | 104 | 
| 105         *colors++ = SkPackARGB32(a, r, g, b); | 105         *colors++ = SkPackARGB32(a, r, g, b); | 
| 106 | 106 | 
| 107         x++; | 107         x++; | 
| 108     } | 108     } | 
| 109 } | 109 } | 
| 110 | 110 | 
| 111 // Convolves horizontally along a single row. The row data is given in | 111 // Convolves horizontally along a single row. The row data is given in | 
| 112 // |src_data| and continues for the num_values() of the filter. | 112 // |src_data| and continues for the num_values() of the filter. | 
| 113 void convolveHorizontally_SSE2(const unsigned char* src_data, | 113 void convolveHorizontally_SSE2(const unsigned char* src_data, | 
| (...skipping 439 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 
| 554     // Padding |paddingCount| of more dummy coefficients after the coefficients | 554     // Padding |paddingCount| of more dummy coefficients after the coefficients | 
| 555     // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 555     // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 
| 556     // together to access invalid memory areas. We are not trying to align the | 556     // together to access invalid memory areas. We are not trying to align the | 
| 557     // coefficients right now due to the opaqueness of <vector> implementation. | 557     // coefficients right now due to the opaqueness of <vector> implementation. | 
| 558     // This has to be done after all |AddFilter| calls. | 558     // This has to be done after all |AddFilter| calls. | 
| 559     for (int i = 0; i < 8; ++i) { | 559     for (int i = 0; i < 8; ++i) { | 
| 560         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
     ed>(0)); | 560         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
     ed>(0)); | 
| 561     } | 561     } | 
| 562 } | 562 } | 
| OLD | NEW | 
|---|