OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
42 | 42 |
43 // because the border is handled specially, this is guaranteed to have all 16 pi
xels | 43 // because the border is handled specially, this is guaranteed to have all 16 pi
xels |
44 // available to it without running off the bitmap's edge. | 44 // available to it without running off the bitmap's edge. |
45 | 45 |
46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, | 46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, |
47 SkPMColor* SK_RESTRICT colors, int count) { | 47 SkPMColor* SK_RESTRICT colors, int count) { |
48 | 48 |
49 const int maxX = s.fBitmap->width(); | 49 const int maxX = s.fBitmap->width(); |
50 const int maxY = s.fBitmap->height(); | 50 const int maxY = s.fBitmap->height(); |
51 SkAutoTMalloc<SkScalar> xWeights(maxX); | 51 SkAutoTMalloc<SkScalar> xWeights(maxX); |
| 52 const SkBitmapFilter* filter = s.getBitmapFilter(); |
52 | 53 |
53 while (count-- > 0) { | 54 while (count-- > 0) { |
54 SkPoint srcPt; | 55 SkPoint srcPt; |
55 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt); | 56 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt); |
56 srcPt.fX -= SK_ScalarHalf; | 57 srcPt.fX -= SK_ScalarHalf; |
57 srcPt.fY -= SK_ScalarHalf; | 58 srcPt.fY -= SK_ScalarHalf; |
58 | 59 |
59 __m128 weight = _mm_setzero_ps(); | 60 __m128 weight = _mm_setzero_ps(); |
60 __m128 accum = _mm_setzero_ps(); | 61 __m128 accum = _mm_setzero_ps(); |
61 | 62 |
62 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->widt
h()), maxY); | 63 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY)
; |
63 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->wid
th()+1), maxY); | 64 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1),
maxY); |
64 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->widt
h()), maxX); | 65 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX)
; |
65 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->wid
th())+1, maxX); | 66 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1,
maxX); |
66 | 67 |
67 for (int srcX = x0; srcX < x1 ; srcX++) { | 68 for (int srcX = x0; srcX < x1 ; srcX++) { |
68 // Looking these up once instead of each loop is a ~15% speedup. | 69 // Looking these up once instead of each loop is a ~15% speedup. |
69 xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX -
srcX)); | 70 xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX)); |
70 } | 71 } |
71 | 72 |
72 for (int srcY = y0; srcY < y1; srcY++) { | 73 for (int srcY = y0; srcY < y1; srcY++) { |
73 SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - src
Y)); | 74 SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY)); |
74 | 75 |
75 for (int srcX = x0; srcX < x1 ; srcX++) { | 76 for (int srcX = x0; srcX < x1 ; srcX++) { |
76 SkScalar xWeight = xWeights[srcX - x0]; | 77 SkScalar xWeight = xWeights[srcX - x0]; |
77 | 78 |
78 SkScalar combined_weight = SkScalarMul(xWeight, yWeight); | 79 SkScalar combined_weight = SkScalarMul(xWeight, yWeight); |
| 80 __m128 weightVector = _mm_set1_ps(combined_weight); |
| 81 weight = _mm_add_ps( weight, weightVector ); |
79 | 82 |
80 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); | 83 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); |
| 84 if (!color) { |
| 85 continue; |
| 86 } |
81 | 87 |
82 __m128i c = _mm_cvtsi32_si128(color); | 88 __m128i c = _mm_cvtsi32_si128(color); |
83 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); | 89 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); |
84 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); | 90 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); |
85 __m128 cfloat = _mm_cvtepi32_ps(c); | 91 __m128 cfloat = _mm_cvtepi32_ps(c); |
86 | 92 |
87 __m128 weightVector = _mm_set1_ps(combined_weight); | |
88 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); | 93 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); |
89 weight = _mm_add_ps( weight, weightVector ); | |
90 } | 94 } |
91 } | 95 } |
92 | 96 |
93 accum = _mm_div_ps(accum, weight); | 97 accum = _mm_div_ps(accum, weight); |
94 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); | 98 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); |
95 __m128i accumInt = _mm_cvttps_epi32(accum); | 99 __m128i accumInt = _mm_cvttps_epi32(accum); |
96 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128()); | 100 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128()); |
97 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128()); | 101 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128()); |
98 SkPMColor c = _mm_cvtsi128_si32(accumInt); | 102 SkPMColor c = _mm_cvtsi128_si32(accumInt); |
99 | 103 |
(...skipping 453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 557 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
554 // Appends 8 dummy (zero) coefficients after the coefficients of the last | 558 // Appends 8 dummy (zero) coefficients after the coefficients of the last |
555 // filter so that SIMD instructions, which load 8 or 16 bytes at a time, | 559 // filter so that SIMD instructions, which load 8 or 16 bytes at a time, |
556 // do not access invalid memory areas. We are not trying to align the | 560 // do not access invalid memory areas. We are not trying to align the |
557 // coefficients right now due to the opaqueness of the <vector> implementation. | 561 // coefficients right now due to the opaqueness of the <vector> implementation. |
558 // This has to be done after all |AddFilter| calls. | 562 // This has to be done after all |AddFilter| calls. |
559 for (int i = 0; i < 8; ++i) { | 563 for (int i = 0; i < 8; ++i) { |
560 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | 564 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); |
561 } | 565 } |
562 } | 566 } |
OLD | NEW |