OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" |
(...skipping 22 matching lines...) Expand all Loading... |
33 v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] | 33 v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] |
34 ); | 34 ); |
35 } | 35 } |
36 | 36 |
// Debug helper: prints the four single-precision lanes of |value| on one line,
// element 0 first, each formatted with "%3.4f".
static inline void print128f(__m128 value) {
    // Copy the lanes into a plain array instead of reading them through a
    // pointer cast on the vector itself.
    float lanes[4];
    _mm_storeu_ps(lanes, value);
    printf("%3.4f %3.4f %3.4f %3.4f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
41 #endif | 41 #endif |
42 | 42 |
43 // because the border is handled specially, this is guaranteed to have all 16 pi
xels | |
44 // available to it without running off the bitmap's edge. | |
45 | |
46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, | |
47 SkPMColor* SK_RESTRICT colors, int count) { | |
48 | |
49 const int maxX = s.fBitmap->width(); | |
50 const int maxY = s.fBitmap->height(); | |
51 SkAutoTMalloc<SkScalar> xWeights(maxX); | |
52 const SkBitmapFilter* filter = s.getBitmapFilter(); | |
53 | |
54 while (count-- > 0) { | |
55 SkPoint srcPt; | |
56 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt); | |
57 srcPt.fX -= SK_ScalarHalf; | |
58 srcPt.fY -= SK_ScalarHalf; | |
59 | |
60 __m128 weight = _mm_setzero_ps(); | |
61 __m128 accum = _mm_setzero_ps(); | |
62 | |
63 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY)
; | |
64 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1),
maxY); | |
65 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX)
; | |
66 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1,
maxX); | |
67 | |
68 for (int srcX = x0; srcX < x1 ; srcX++) { | |
69 // Looking these up once instead of each loop is a ~15% speedup. | |
70 xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX)); | |
71 } | |
72 | |
73 for (int srcY = y0; srcY < y1; srcY++) { | |
74 SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY)); | |
75 | |
76 for (int srcX = x0; srcX < x1 ; srcX++) { | |
77 SkScalar xWeight = xWeights[srcX - x0]; | |
78 | |
79 SkScalar combined_weight = SkScalarMul(xWeight, yWeight); | |
80 __m128 weightVector = _mm_set1_ps(combined_weight); | |
81 weight = _mm_add_ps( weight, weightVector ); | |
82 | |
83 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); | |
84 if (!color) { | |
85 continue; | |
86 } | |
87 | |
88 __m128i c = _mm_cvtsi32_si128(color); | |
89 c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); | |
90 c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); | |
91 __m128 cfloat = _mm_cvtepi32_ps(c); | |
92 | |
93 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); | |
94 } | |
95 } | |
96 | |
97 accum = _mm_div_ps(accum, weight); | |
98 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); | |
99 __m128i accumInt = _mm_cvttps_epi32(accum); | |
100 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128()); | |
101 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128()); | |
102 SkPMColor c = _mm_cvtsi128_si32(accumInt); | |
103 | |
104 int a = SkClampMax(SkGetPackedA32(c), 255); | |
105 int r = SkClampMax(SkGetPackedR32(c), a); | |
106 int g = SkClampMax(SkGetPackedG32(c), a); | |
107 int b = SkClampMax(SkGetPackedB32(c), a); | |
108 | |
109 *colors++ = SkPackARGB32(a, r, g, b); | |
110 | |
111 x++; | |
112 } | |
113 } | |
114 | |
115 // Convolves horizontally along a single row. The row data is given in | 43 // Convolves horizontally along a single row. The row data is given in |
116 // |src_data| and continues for the num_values() of the filter. | 44 // |src_data| and continues for the num_values() of the filter. |
117 void convolveHorizontally_SSE2(const unsigned char* src_data, | 45 void convolveHorizontally_SSE2(const unsigned char* src_data, |
118 const SkConvolutionFilter1D& filter, | 46 const SkConvolutionFilter1D& filter, |
119 unsigned char* out_row, | 47 unsigned char* out_row, |
120 bool /*has_alpha*/) { | 48 bool /*has_alpha*/) { |
121 int num_values = filter.numValues(); | 49 int num_values = filter.numValues(); |
122 | 50 |
123 int filter_offset, filter_length; | 51 int filter_offset, filter_length; |
124 __m128i zero = _mm_setzero_si128(); | 52 __m128i zero = _mm_setzero_si128(); |
(...skipping 432 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
557 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 485 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
558 // Padding |paddingCount| of more dummy coefficients after the coefficients | 486 // Padding |paddingCount| of more dummy coefficients after the coefficients |
559 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 487 // of last filter to prevent SIMD instructions which load 8 or 16 bytes |
560 // together to access invalid memory areas. We are not trying to align the | 488 // together to access invalid memory areas. We are not trying to align the |
561 // coefficients right now due to the opaqueness of <vector> implementation. | 489 // coefficients right now due to the opaqueness of <vector> implementation. |
562 // This has to be done after all |AddFilter| calls. | 490 // This has to be done after all |AddFilter| calls. |
563 for (int i = 0; i < 8; ++i) { | 491 for (int i = 0; i < 8; ++i) { |
564 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | 492 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); |
565 } | 493 } |
566 } | 494 } |
OLD | NEW |