| Index: src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..eaba46b2348b9b15980cac38b44849ce679d1f2d
|
| --- /dev/null
|
| +++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
|
| @@ -0,0 +1,236 @@
|
| +/*
|
| + * Copyright 2013 Google Inc.
|
| + *
|
| + * Use of this source code is governed by a BSD-style license that can be
|
| + * found in the LICENSE file.
|
| + */
|
| +
|
| +#include "SkBitmapProcState.h"
|
| +#include "SkBitmap.h"
|
| +#include "SkColor.h"
|
| +#include "SkColorPriv.h"
|
| +#include "SkUnPreMultiply.h"
|
| +#include "SkShader.h"
|
| +
|
| +#include <emmintrin.h>
|
| +
|
| +#define DS(x) SkDoubleToScalar(x)  // shorthand alias for SkDoubleToScalar; NOTE(review): no use visible in this file
|
| +
|
| +#define MUL(a, b) ((a) * (b))  // fully parenthesized product of a and b; NOTE(review): no use visible in this file
|
| +
|
| +// Debug helper: prints the four 32-bit lanes of an SSE integer register as
|
| +// signed decimals on a single line of stdout.
|
| +static inline void print128i(__m128i value) {
|
| +    // Copy the lanes into a plain array with an unaligned store rather than
|
| +    // reading the register through a casted int pointer (strict aliasing).
|
| +    int lanes[4];
|
| +    _mm_storeu_si128((__m128i*) lanes, value);
|
| +    printf("% .11d % .11d % .11d % .11d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
|
| +}
|
| +
|
| +// Debug helper: prints the eight 16-bit lanes of an SSE integer register as
|
| +// signed decimals on a single line of stdout.
|
| +static inline void print128i_16(__m128i value) {
|
| +    // Store to a local array instead of aliasing the register as short*.
|
| +    short lanes[8];
|
| +    _mm_storeu_si128((__m128i*) lanes, value);
|
| +    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n",
|
| +           lanes[0], lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7]);
|
| +}
|
| +
|
| +// Debug helper: prints the sixteen bytes of an SSE integer register as
|
| +// unsigned decimals on a single line of stdout.
|
| +static inline void print128i_8(__m128i value) {
|
| +    // Store to a local byte array so the lanes are read from ordinary memory.
|
| +    unsigned char lanes[16];
|
| +    _mm_storeu_si128((__m128i*) lanes, value);
|
| +    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
|
| +           lanes[0], lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7],
|
| +           lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15]
|
| +    );
|
| +}
|
| +
|
| +// Debug helper: prints the four float lanes of an SSE register on a single
|
| +// line of stdout, four decimals each.
|
| +static inline void print128f(__m128 value) {
|
| +    // Unaligned store into a float array avoids reading the register through
|
| +    // a casted float pointer.
|
| +    float lanes[4];
|
| +    _mm_storeu_ps(lanes, value);
|
| +    printf("%3.4f %3.4f %3.4f %3.4f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
|
| +}
|
| +
|
| +#define BLAH( s ) if (debug) { s; }  // runs statement s only when the identifier `debug` in the expanding scope is true; expands to a bare if, so never follow it with `else`
|
| +
|
| +// some SSE blending code adapted from theowl84's bilinear blog:
|
| +// http://fastcpp.blogspot.com/2011/06/bilinear-pixel-interpolation-using-sse.html
|
| +
|
| +#define WEIGHT_BITS 6  // shift count; 1 << WEIGHT_BITS (64) is presumably the fixed-point scale for filter weights - TODO confirm
|
| +
|
| +// Widens the four 8-bit channels of one packed pixel into four floats, with
|
| +// lane i holding byte i of the pixel. Uses SSE2 integer unpacks only, so no
|
| +// MMX state is touched and no _mm_empty() is needed afterwards.
|
| +static inline __m128 unpack_pixel_ps(SkPMColor px) {
|
| +    const __m128i zero = _mm_setzero_si128();
|
| +    __m128i bytes  = _mm_cvtsi32_si128((int) px);       // pixel in the low 32 bits
|
| +    __m128i words  = _mm_unpacklo_epi8(bytes, zero);    // u8  -> u16
|
| +    __m128i dwords = _mm_unpacklo_epi16(words, zero);   // u16 -> u32
|
| +    return _mm_cvtepi32_ps(dwords);
|
| +}
|
| +
|
| +// Loads the four pixels p[0..3] as float vectors and, when debug is true,
|
| +// dumps both the packed channels and the unpacked vectors to stdout.
|
| +//
|
| +// coeffs - per-tap weights; currently not consumed by this function.
|
| +// p      - pointer to four readable SkPMColor values.
|
| +// debug  - enables the diagnostic printf output.
|
| +//
|
| +// NOTE(review): the weighted sum itself is not implemented yet; the function
|
| +// always returns 0 regardless of its inputs.
|
| +static inline SkPMColor cubicBlendSSE(__m128 coeffs, SkPMColor *p, bool debug=false) {
|
| +
|
| +    if (debug) {
|
| +        for (int i = 0; i < 4; ++i) {
|
| +            printf( "*(p+%d) = (%.3u %.3u %.3u %.3u)\n", i,
|
| +                    SkGetPackedR32(p[i]), SkGetPackedG32(p[i]),
|
| +                    SkGetPackedB32(p[i]), SkGetPackedA32(p[i]) );
|
| +        }
|
| +    }
|
| +
|
| +    __m128 px[4];
|
| +    for (int i = 0; i < 4; ++i) {
|
| +        px[i] = unpack_pixel_ps( p[i] );
|
| +    }
|
| +
|
| +    if (debug) {
|
| +        // Each label is paired with its own vector (p1..p4 map to px[0..3]).
|
| +        for (int i = 0; i < 4; ++i) {
|
| +            printf( "p%d = ", i + 1 );
|
| +            print128f( px[i] );
|
| +        }
|
| +    }
|
| +
|
| +    return (SkPMColor) 0;
|
| +
|
| +}
|
| +
|
| +// Evaluates four cubic polynomials in t at once and returns the four results
|
| +// in one SSE register: weight = c0 + c1*t + c2*t^2 + c3*t^3, lane by lane.
|
| +//
|
| +// t     - polynomial argument (callers pass the fractional sample offset).
|
| +// debug - when true, prints t and the resulting weights to stdout.
|
| +//
|
| +// The coefficient vectors are per-call constants rather than lazily filled
|
| +// statics, so there is no shared mutable state between concurrent callers.
|
| +static __m128 build_coeff_sse(float t, bool debug=false) {
|
| +    // note these coefficients are stored as the transpose of the ones in the
|
| +    // scalar CPU code so we can do all four dot products at once (SOA organization).
|
| +    // _mm_set_ps takes its arguments from the highest lane down to the lowest.
|
| +    const __m128 c0 = _mm_set_ps(  0.0f / 18.0f,   1.0f / 18.0f,  16.0f / 18.0f,  1.0f / 18.0f);
|
| +    const __m128 c1 = _mm_set_ps(  0.0f / 18.0f,   9.0f / 18.0f,   0.0f / 18.0f, -9.0f / 18.0f);
|
| +    const __m128 c2 = _mm_set_ps( -6.0f / 18.0f,  27.0f / 18.0f, -36.0f / 18.0f, 15.0f / 18.0f);
|
| +    const __m128 c3 = _mm_set_ps(  7.0f / 18.0f, -21.0f / 18.0f,  21.0f / 18.0f, -7.0f / 18.0f);
|
| +
|
| +    // evaluate the cubic polynomial, highest power first
|
| +    const __m128 tvec  = _mm_set1_ps( t );
|
| +    const __m128 t2vec = _mm_mul_ps( tvec, tvec );
|
| +    const __m128 t3vec = _mm_mul_ps( tvec, t2vec );
|
| +
|
| +    __m128 weight = _mm_mul_ps( t3vec, c3 );
|
| +    weight = _mm_add_ps( weight, _mm_mul_ps( t2vec, c2 ));
|
| +    weight = _mm_add_ps( weight, _mm_mul_ps( tvec, c1 ));
|
| +    weight = _mm_add_ps( weight, c0 );
|
| +
|
| +    BLAH( printf( "t = %f\n", t ) )
|
| +    BLAH( printf( "weight = " ) )
|
| +    BLAH( print128f( weight ) )
|
| +
|
| +    return weight;
|
| +}
|
| +
|
| +// because the border is handled specially, this is guaranteed to have all 16 pixels
|
| +// available to it without running off the bitmap's edge.
|
| +
|
| +// the coefficients are split across two registers as prepared by the build_coeff function above.
|
| +// Filters the 4x4 pixel neighbourhood whose second column/row is (sx, sy):
|
| +// each of the four rows is blended with coeffX, then the four row results
|
| +// are blended with coeffY.
|
| +//
|
| +// bm             - source bitmap, read through getAddr32().
|
| +// coeffX, coeffY - per-axis weights handed to cubicBlendSSE.
|
| +// sx, sy         - integer sample position; columns sx-1..sx+2 and rows
|
| +//                  sy-1..sy+2 are read, clamped to the bitmap near its edges.
|
| +// debug          - forwarded to cubicBlendSSE and enables the trace printf.
|
| +static SkPMColor doBicubicFilterSSE(const SkBitmap *bm,
|
| +                                    __m128 coeffX, __m128 coeffY,
|
| +                                    int sx, int sy,
|
| +                                    bool debug=false )
|
| +{
|
| +    const int lastCol = bm->width() - 1;
|
| +    const int lastRow = bm->height() - 1;
|
| +
|
| +    SkPMColor patch[4][4];   // clamped copy of the neighbourhood, border case only
|
| +    SkPMColor *rowStart[4];  // first pixel of each of the four rows to blend
|
| +
|
| +    BLAH( printf( "in doBicubicFilterSSE with sx=%d, sy=%d\n", sx, sy))
|
| +
|
| +    const bool interior = sx > 0 && sy > 0 && sx < lastCol - 2 && sy < lastRow - 2;
|
| +    if (interior) {
|
| +        // Whole neighbourhood lies inside the bitmap: read rows in place.
|
| +        for (int r = 0; r < 4; ++r) {
|
| +            rowStart[r] = bm->getAddr32(sx - 1, sy - 1 + r);
|
| +        }
|
| +    } else {
|
| +        // Near an edge: clamp every coordinate and gather into the local patch.
|
| +        int cols[4];
|
| +        int lines[4];
|
| +        for (int k = 0; k < 4; ++k) {
|
| +            cols[k] = SkClampMax(sx - 1 + k, lastCol);
|
| +        }
|
| +        for (int k = 0; k < 4; ++k) {
|
| +            lines[k] = SkClampMax(sy - 1 + k, lastRow);
|
| +        }
|
| +        for (int r = 0; r < 4; ++r) {
|
| +            for (int c = 0; c < 4; ++c) {
|
| +                patch[r][c] = *bm->getAddr32(cols[c], lines[r]);
|
| +            }
|
| +        }
|
| +        for (int r = 0; r < 4; ++r) {
|
| +            rowStart[r] = patch[r];
|
| +        }
|
| +    }
|
| +
|
| +    // Horizontal pass over each row, then one vertical pass over the results.
|
| +    SkPMColor rowBlend[4];
|
| +    for (int r = 0; r < 4; ++r) {
|
| +        rowBlend[r] = cubicBlendSSE(coeffX, rowStart[r], debug);
|
| +    }
|
| +
|
| +    return cubicBlendSSE(coeffY, rowBlend, debug);
|
| +}
|
| +
|
| +
|
| +int debug_x = 20;  // externally visible global; NOTE(review): never read in this file, presumably a leftover debug pixel column
|
| +int debug_y = 255;  // externally visible global; NOTE(review): never read in this file, presumably a leftover debug pixel row
|
| +
|
| +// Fills colors[0..count-1] with filtered samples for the destination pixels
|
| +// (x, y), (x+1, y), ... (x+count-1, y).
|
| +//
|
| +// Each destination pixel is mapped through s.fInvProc with *s.fInvMatrix,
|
| +// shifted by half a pixel, and split into an integer position and a
|
| +// fractional offset per axis; the fractions select the filter weights and
|
| +// the integers select the neighbourhood passed to doBicubicFilterSSE.
|
| +//
|
| +// Diagnostic output is left off here: the debug flag of doBicubicFilterSSE
|
| +// would otherwise print several lines to stdout for every pixel.
|
| +void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
|
| +                            SkPMColor* SK_RESTRICT colors, int count) {
|
| +
|
| +    for (; count > 0; --count, ++x) {
|
| +        SkPoint srcPt;
|
| +        s.fInvProc(*s.fInvMatrix, SkIntToScalar(x),
|
| +                   SkIntToScalar(y), &srcPt);
|
| +
|
| +        srcPt.fX -= SK_ScalarHalf;
|
| +        srcPt.fY -= SK_ScalarHalf;
|
| +        const SkScalar fractx = srcPt.fX - SkScalarFloorToScalar(srcPt.fX);
|
| +        const SkScalar fracty = srcPt.fY - SkScalarFloorToScalar(srcPt.fY);
|
| +
|
| +        const int sx = SkScalarFloorToInt(srcPt.fX);
|
| +        const int sy = SkScalarFloorToInt(srcPt.fY);
|
| +
|
| +        const __m128 coeffX = build_coeff_sse(fractx);
|
| +        const __m128 coeffY = build_coeff_sse(fracty);
|
| +
|
| +        SkPMColor color = doBicubicFilterSSE( s.fBitmap, coeffX, coeffY, sx, sy );
|
| +        SkPMColorAssert(color);
|
| +        *colors++ = color;
|
| +    }
|
| +
|
| +}
|
| +
|
| +// Variant of the high-quality filter that derives the source row and the
|
| +// vertical weights once, from the first destination pixel, and reuses them
|
| +// for the whole span; only the horizontal position and weights are
|
| +// recomputed per pixel.
|
| +// NOTE(review): this presumably relies on the inverse mapping keeping the
|
| +// source y constant along a destination row (scale/translate only) - confirm
|
| +// against the caller that selects this routine.
|
| +//
|
| +// Writes colors[0..count-1] for destination pixels (x, y) .. (x+count-1, y).
|
| +void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
|
| +                                      SkPMColor *SK_RESTRICT colors, int count) {
|
| +    SkPoint srcPt;
|
| +    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), SkIntToScalar(y), &srcPt);
|
| +    srcPt.fY -= SK_ScalarHalf;
|
| +    const SkScalar fracty = srcPt.fY - SkScalarFloorToScalar(srcPt.fY);
|
| +    const __m128 coeffY = build_coeff_sse(fracty);
|
| +    const int sy = SkScalarFloorToInt(srcPt.fY);
|
| +
|
| +    while (count-- > 0) {
|
| +        s.fInvProc(*s.fInvMatrix, SkIntToScalar(x), SkIntToScalar(y), &srcPt);
|
| +        srcPt.fX -= SK_ScalarHalf;
|
| +        const SkScalar fractx = srcPt.fX - SkScalarFloorToScalar(srcPt.fX);
|
| +
|
| +        const int sx = SkScalarFloorToInt(srcPt.fX);
|
| +
|
| +        const __m128 coeffX = build_coeff_sse(fractx);
|
| +        SkPMColor c = doBicubicFilterSSE( s.fBitmap, coeffX, coeffY, sx, sy );
|
| +        SkPMColorAssert(c);
|
| +        *colors++ = c;
|
| +
|
| +        x++;
|
| +    }
|
| +}
|
|
|