OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2013 Google Inc. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. |
| 6 */ |
| 7 |
| 8 #include "SkBitmapProcState.h" |
| 9 #include "SkBitmap.h" |
| 10 #include "SkColor.h" |
| 11 #include "SkColorPriv.h" |
| 12 #include "SkUnPreMultiply.h" |
| 13 #include "SkShader.h" |
| 14 |
| 15 #include <emmintrin.h> |
| 16 |
| 17 #define DS(x) SkDoubleToScalar(x) |
| 18 |
| 19 #define MUL(a, b) ((a) * (b)) |
| 20 |
// Debug helper: prints the four 32-bit lanes of |value| to stdout as signed
// decimals, lowest lane first, followed by a newline.
static inline void print128i(__m128i value) {
    // Copy the lanes out with an unaligned store instead of reading the
    // vector object through a casted int pointer.
    int lanes[4];
    _mm_storeu_si128((__m128i*) lanes, value);
    printf("% .11d % .11d % .11d % .11d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
| 25 |
// Debug helper: prints the eight 16-bit lanes of |value| to stdout as signed
// decimals, lowest lane first, followed by a newline.
static inline void print128i_16(__m128i value) {
    // Copy the lanes out with an unaligned store instead of reading the
    // vector object through a casted short pointer.
    short lanes[8];
    _mm_storeu_si128((__m128i*) lanes, value);
    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}
| 30 |
// Debug helper: prints the sixteen bytes of |value| to stdout as zero-padded
// unsigned decimals, lowest byte first, followed by a newline.
static inline void print128i_8(__m128i value) {
    const unsigned char *bytes = (const unsigned char*) &value;
    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
           bytes[0],  bytes[1],  bytes[2],  bytes[3],
           bytes[4],  bytes[5],  bytes[6],  bytes[7],
           bytes[8],  bytes[9],  bytes[10], bytes[11],
           bytes[12], bytes[13], bytes[14], bytes[15]);
}
| 38 |
// Debug helper: prints the four float lanes of |value| to stdout, lowest lane
// first, followed by a newline.
static inline void print128f(__m128 value) {
    // Copy the lanes out with an unaligned store instead of reading the
    // vector object through a casted float pointer.
    float lanes[4];
    _mm_storeu_ps(lanes, value);
    printf("%3.4f %3.4f %3.4f %3.4f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
| 43 |
// Runs statement |s| only when a variable named `debug` in the enclosing scope
// is true. Written as if/else so that an `else` following a BLAH(...) use can
// never attach to the macro's own `if`. Call sites may or may not add a
// trailing semicolon; both forms expand to valid statements.
#define BLAH( s ) if (!(debug)) { } else { s; }
| 45 |
// Some of the SSE blending code is adapted from theowl84's blog post on
// bilinear pixel interpolation:
// http://fastcpp.blogspot.com/2011/06/bilinear-pixel-interpolation-using-sse.html
| 48 |
| 49 #define WEIGHT_BITS 6 |
| 50 |
| 51 static inline SkPMColor cubicBlendSSE(__m128 coeffs, SkPMColor *p, bool debug=fa
lse) { |
| 52 |
| 53 BLAH( printf( "*(p+0) = (%.3u %.3u %.3u %.3u)\n", SkGetPackedR32(*(p+0)), Sk
GetPackedG32(*(p+0)), SkGetPackedB32(*(p+0)), SkGetPackedA32(*(p+0)) ) ) |
| 54 BLAH( printf( "*(p+1) = (%.3u %.3u %.3u %.3u)\n", SkGetPackedR32(*(p+1)), Sk
GetPackedG32(*(p+1)), SkGetPackedB32(*(p+1)), SkGetPackedA32(*(p+1)) ) ) |
| 55 BLAH( printf( "*(p+2) = (%.3u %.3u %.3u %.3u)\n", SkGetPackedR32(*(p+2)), Sk
GetPackedG32(*(p+2)), SkGetPackedB32(*(p+2)), SkGetPackedA32(*(p+2)) ) ) |
| 56 BLAH( printf( "*(p+3) = (%.3u %.3u %.3u %.3u)\n", SkGetPackedR32(*(p+3)), Sk
GetPackedG32(*(p+3)), SkGetPackedB32(*(p+3)), SkGetPackedA32(*(p+3)) ) ) |
| 57 |
| 58 __m128 p1 = _mm_cvtpu8_ps( _mm_set1_pi32( *(p+0) ) ); |
| 59 __m128 p2 = _mm_cvtpu8_ps( _mm_set1_pi32( *(p+1) ) ); |
| 60 __m128 p3 = _mm_cvtpu8_ps( _mm_set1_pi32( *(p+2) ) ); |
| 61 __m128 p4 = _mm_cvtpu8_ps( _mm_set1_pi32( *(p+3) ) ); |
| 62 |
| 63 BLAH( printf( "p1 = " ) ); |
| 64 BLAH( print128f( p1 )); |
| 65 BLAH( printf( "p2 = " ) ); |
| 66 BLAH( print128f( p1 )); |
| 67 BLAH( printf( "p3 = " ) ); |
| 68 BLAH( print128f( p1 )); |
| 69 BLAH( printf( "p4 = " ) ); |
| 70 BLAH( print128f( p1 )); |
| 71 |
| 72 |
| 73 |
| 74 return (SkPMColor) 0; |
| 75 |
| 76 } |
| 77 |
| 78 static __m128 build_coeff_sse(float t, bool debug=false) { |
| 79 static __m128 coeffs[4]; |
| 80 static const __m128 CONST_WEIGHT_FIXED = _mm_set1_ps(1 << WEIGHT_BITS); |
| 81 static bool once = true; |
| 82 if (once) { |
| 83 once = false; |
| 84 // note these coefficients are stored as the transpose of the ones in th
e |
| 85 // scalar CPU code so we can do all four dot products at once (SOA organ
ization) |
| 86 coeffs[0] = _mm_set_ps( 0.0f / 18.0f , 1.0f / 18.0f, 16.0f / 18.0f ,
1.0f / 18.0f); |
| 87 coeffs[1] = _mm_set_ps( 0.0f / 18.0f , 9.0f / 18.0f, 0.0f / 18.0f ,
-9.0f / 18.0f); |
| 88 coeffs[2] = _mm_set_ps( -6.0f / 18.0f , 27.0f / 18.0f, -36.0f / 18.0f ,
15.0f / 18.0f); |
| 89 coeffs[3] = _mm_set_ps( 7.0f / 18.0f ,-21.0f / 18.0f, 21.0f / 18.0f ,
-7.0f / 18.0f); |
| 90 }; |
| 91 |
| 92 // evaluate the cubic polynomial |
| 93 |
| 94 __m128 tvec = _mm_set1_ps( t ); |
| 95 __m128 t2vec = _mm_mul_ps( tvec, tvec ); |
| 96 __m128 t3vec = _mm_mul_ps( tvec, t2vec ); |
| 97 |
| 98 __m128 weight = _mm_mul_ps( t3vec, coeffs[3] ); |
| 99 weight = _mm_add_ps( weight, _mm_mul_ps( t2vec, coeffs[2])); |
| 100 weight = _mm_add_ps( weight, _mm_mul_ps( tvec, coeffs[1])); |
| 101 weight = _mm_add_ps( weight, coeffs[0] ); |
| 102 |
| 103 BLAH( printf( "t = %f\n", t ) ) |
| 104 BLAH( printf( "weight = " ) ) |
| 105 BLAH( print128f( weight ) ) |
| 106 |
| 107 return weight; |
| 108 } |
| 109 |
| 110 // because the border is handled specially, this is guaranteed to have all 16 pi
xels |
| 111 // available to it without running off the bitmap's edge. |
| 112 |
| 113 // the coefficients are split across two registers as prepared by the build_coef
f function above. |
| 114 static SkPMColor doBicubicFilterSSE(const SkBitmap *bm, |
| 115 __m128 coeffX, __m128 coeffY, |
| 116 int sx, int sy, |
| 117 bool debug=false ) |
| 118 { |
| 119 SkPMColor x_blends[4]; |
| 120 |
| 121 const int maxX = bm->width() - 1; |
| 122 const int maxY = bm->height() - 1; |
| 123 |
| 124 SkPMColor temp[4][4]; // used for special-casing the border |
| 125 SkPMColor *rows[4]; |
| 126 |
| 127 BLAH( printf( "in doBicubicFilterSSE with sx=%d, sy=%d\n", sx, sy)) |
| 128 |
| 129 if (sx <= 0 || sy <= 0 || sx >= maxX-2 || sy >= maxY-2) { |
| 130 // handle border specially |
| 131 int x0 = SkClampMax(sx - 1, maxX); |
| 132 int x1 = SkClampMax(sx , maxX); |
| 133 int x2 = SkClampMax(sx + 1, maxX); |
| 134 int x3 = SkClampMax(sx + 2, maxX); |
| 135 int y0 = SkClampMax(sy - 1, maxY); |
| 136 int y1 = SkClampMax(sy , maxY); |
| 137 int y2 = SkClampMax(sy + 1, maxY); |
| 138 int y3 = SkClampMax(sy + 2, maxY); |
| 139 temp[0][0] = *bm->getAddr32(x0, y0); |
| 140 temp[0][1] = *bm->getAddr32(x1, y0); |
| 141 temp[0][2] = *bm->getAddr32(x2, y0); |
| 142 temp[0][3] = *bm->getAddr32(x3, y0); |
| 143 temp[1][0] = *bm->getAddr32(x0, y1); |
| 144 temp[1][1] = *bm->getAddr32(x1, y1); |
| 145 temp[1][2] = *bm->getAddr32(x2, y1); |
| 146 temp[1][3] = *bm->getAddr32(x3, y1); |
| 147 temp[2][0] = *bm->getAddr32(x0, y2); |
| 148 temp[2][1] = *bm->getAddr32(x1, y2); |
| 149 temp[2][2] = *bm->getAddr32(x2, y2); |
| 150 temp[2][3] = *bm->getAddr32(x3, y2); |
| 151 temp[3][0] = *bm->getAddr32(x0, y3); |
| 152 temp[3][1] = *bm->getAddr32(x1, y3); |
| 153 temp[3][2] = *bm->getAddr32(x2, y3); |
| 154 temp[3][3] = *bm->getAddr32(x3, y3); |
| 155 |
| 156 rows[0] = temp[0]; |
| 157 rows[1] = temp[1]; |
| 158 rows[2] = temp[2]; |
| 159 rows[3] = temp[3]; |
| 160 } else { |
| 161 rows[0] = bm->getAddr32(sx-1,sy-1); |
| 162 rows[1] = bm->getAddr32(sx-1,sy-0); |
| 163 rows[2] = bm->getAddr32(sx-1,sy+1); |
| 164 rows[3] = bm->getAddr32(sx-1,sy+2); |
| 165 } |
| 166 |
| 167 x_blends[0] = cubicBlendSSE(coeffX, rows[0], debug); |
| 168 x_blends[1] = cubicBlendSSE(coeffX, rows[1], debug); |
| 169 x_blends[2] = cubicBlendSSE(coeffX, rows[2], debug); |
| 170 x_blends[3] = cubicBlendSSE(coeffX, rows[3], debug); |
| 171 |
| 172 return cubicBlendSSE(coeffY, x_blends, debug); |
| 173 } |
| 174 |
| 175 |
| 176 int debug_x = 20; |
| 177 int debug_y = 255; |
| 178 |
| 179 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, |
| 180 SkPMColor* SK_RESTRICT colors, int count) { |
| 181 |
| 182 SkPMColor *orig_colors = colors; |
| 183 |
| 184 while (count-- > 0) { |
| 185 SkPoint srcPt; |
| 186 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), |
| 187 SkIntToScalar(y), &srcPt); |
| 188 |
| 189 srcPt.fX -= SK_ScalarHalf; |
| 190 srcPt.fY -= SK_ScalarHalf; |
| 191 SkScalar fractx = srcPt.fX - SkScalarFloorToScalar(srcPt.fX); |
| 192 SkScalar fracty = srcPt.fY - SkScalarFloorToScalar(srcPt.fY); |
| 193 |
| 194 __m128 coeffX, coeffY; |
| 195 int sx = SkScalarFloorToInt(srcPt.fX); |
| 196 int sy = SkScalarFloorToInt(srcPt.fY); |
| 197 |
| 198 coeffX = build_coeff_sse(fractx); |
| 199 coeffY = build_coeff_sse(fracty); |
| 200 |
| 201 SkPMColor color = doBicubicFilterSSE( s.fBitmap, coeffX, coeffY, sx, sy,
1 ); |
| 202 SkPMColorAssert(color); |
| 203 *colors++ = color; |
| 204 x++; |
| 205 } |
| 206 |
| 207 } |
| 208 |
| 209 void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y, |
| 210 SkPMColor *SK_RESTRICT colors, int count) { |
| 211 const int maxX = s.fBitmap->width() - 1; |
| 212 const int maxY = s.fBitmap->height() - 1; |
| 213 |
| 214 SkPoint srcPt; |
| 215 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), SkIntToScalar(y), &srcPt); |
| 216 srcPt.fY -= SK_ScalarHalf; |
| 217 SkScalar fracty = srcPt.fY - SkScalarFloorToScalar(srcPt.fY); |
| 218 __m128 coeffX, coeffY; |
| 219 coeffY = build_coeff_sse(fracty); |
| 220 int sy = SkScalarFloorToInt(srcPt.fY); |
| 221 |
| 222 while (count-- > 0) { |
| 223 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), SkIntToScalar(y), &srcPt); |
| 224 srcPt.fX -= SK_ScalarHalf; |
| 225 SkScalar fractx = srcPt.fX - SkScalarFloorToScalar(srcPt.fX); |
| 226 |
| 227 int sx = SkScalarFloorToInt(srcPt.fX); |
| 228 |
| 229 coeffX = build_coeff_sse(fractx); |
| 230 SkPMColor c = doBicubicFilterSSE( s.fBitmap, coeffX, coeffY, sx, sy ); |
| 231 SkPMColorAssert(c); |
| 232 *colors++ = c; |
| 233 |
| 234 x++; |
| 235 } |
| 236 } |
OLD | NEW |