| Index: src/opts/SkBitmapProcState_matrix_neon.h
|
| diff --git a/src/opts/SkBitmapProcState_matrix_neon.h b/src/opts/SkBitmapProcState_matrix_neon.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..bc1b59f204be06f319a5b93a09c9a17959710864
|
| --- /dev/null
|
| +++ b/src/opts/SkBitmapProcState_matrix_neon.h
|
| @@ -0,0 +1,504 @@
|
| +
|
| +#include <arm_neon.h>
|
| +
|
| +
|
| +#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
|
| +#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
|
| +#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
|
| +#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
|
| +#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
|
| +#define PERSP_FILTER_NAME MAKENAME(_filter_persp)
|
| +
|
| +#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
|
| +#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
|
| +#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
|
| +#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
|
| +
|
| +#ifndef PREAMBLE
|
| + #define PREAMBLE(state)
|
| + #define PREAMBLE_PARAM_X
|
| + #define PREAMBLE_PARAM_Y
|
| + #define PREAMBLE_ARG_X
|
| + #define PREAMBLE_ARG_Y
|
| +#endif
|
| +
|
| +static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
|
| + uint32_t xy[], int count, int x, int y) {
|
| + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
|
| + SkMatrix::kScale_Mask)) == 0);
|
| +
|
| + PREAMBLE(s);
|
| +
|
| + // we store y, x, x, x, x, x
|
| + const unsigned maxX = s.fBitmap->width() - 1;
|
| + SkFixed fx;
|
| + {
|
| + SkPoint pt;
|
| + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
|
| + SkIntToScalar(y) + SK_ScalarHalf, &pt);
|
| + fx = SkScalarToFixed(pt.fY);
|
| + const unsigned maxY = s.fBitmap->height() - 1;
|
| + *xy++ = TILEY_PROCF(fx, maxY);
|
| + fx = SkScalarToFixed(pt.fX);
|
| + }
|
| +
|
| + if (0 == maxX) {
|
| + // all of the following X values must be 0
|
| + memset(xy, 0, count * sizeof(uint16_t));
|
| + return;
|
| + }
|
| +
|
| + const SkFixed dx = s.fInvSx;
|
| +
|
| +#ifdef CHECK_FOR_DECAL
|
| + // test if we don't need to apply the tile proc
|
| + if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
|
| + decal_nofilter_scale_neon(xy, fx, dx, count);
|
| + return;
|
| + }
|
| +#endif
|
| +
|
| + if (count >= 8) {
|
| + SkFixed dx2 = dx+dx;
|
| + SkFixed dx4 = dx2+dx2;
|
| + SkFixed dx8 = dx4+dx4;
|
| +
|
| + // now build fx/fx+dx/fx+2dx/fx+3dx
|
| + SkFixed fx1, fx2, fx3;
|
| + int32x4_t lbase, hbase;
|
| + int16_t *dst16 = (int16_t *)xy;
|
| +
|
| + fx1 = fx+dx;
|
| + fx2 = fx1+dx;
|
| + fx3 = fx2+dx;
|
| +
|
| + lbase = vdupq_n_s32(fx);
|
| + lbase = vsetq_lane_s32(fx1, lbase, 1);
|
| + lbase = vsetq_lane_s32(fx2, lbase, 2);
|
| + lbase = vsetq_lane_s32(fx3, lbase, 3);
|
| + hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
|
| +
|
| + // store & bump
|
| + while (count >= 8) {
|
| +
|
| + int16x8_t fx8;
|
| +
|
| + fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
|
| +
|
| + vst1q_s16(dst16, fx8);
|
| +
|
| + // but preserving base & on to the next
|
| + lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
|
| + hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
|
| + dst16 += 8;
|
| + count -= 8;
|
| + fx += dx8;
|
| + };
|
| + xy = (uint32_t *) dst16;
|
| + }
|
| +
|
| + uint16_t* xx = (uint16_t*)xy;
|
| + for (int i = count; i > 0; --i) {
|
| + *xx++ = TILEX_PROCF(fx, maxX);
|
| + fx += dx;
|
| + }
|
| +}
|
| +
|
| +static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
|
| + uint32_t xy[], int count, int x, int y) {
|
| + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
|
| + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
|
| + SkMatrix::kScale_Mask |
|
| + SkMatrix::kAffine_Mask)) == 0);
|
| +
|
| + PREAMBLE(s);
|
| + SkPoint srcPt;
|
| + s.fInvProc(s.fInvMatrix,
|
| + SkIntToScalar(x) + SK_ScalarHalf,
|
| + SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
|
| +
|
| + SkFixed fx = SkScalarToFixed(srcPt.fX);
|
| + SkFixed fy = SkScalarToFixed(srcPt.fY);
|
| + SkFixed dx = s.fInvSx;
|
| + SkFixed dy = s.fInvKy;
|
| + int maxX = s.fBitmap->width() - 1;
|
| + int maxY = s.fBitmap->height() - 1;
|
| +
|
| + if (count >= 8) {
|
| + SkFixed dx4 = dx * 4;
|
| + SkFixed dy4 = dy * 4;
|
| + SkFixed dx8 = dx * 8;
|
| + SkFixed dy8 = dy * 8;
|
| +
|
| + int32x4_t xbase, ybase;
|
| + int32x4_t x2base, y2base;
|
| + int16_t *dst16 = (int16_t *) xy;
|
| +
|
| + // now build fx, fx+dx, fx+2dx, fx+3dx
|
| + xbase = vdupq_n_s32(fx);
|
| + xbase = vsetq_lane_s32(fx+dx, xbase, 1);
|
| + xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);
|
| + xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);
|
| +
|
| + // same for fy
|
| + ybase = vdupq_n_s32(fy);
|
| + ybase = vsetq_lane_s32(fy+dy, ybase, 1);
|
| + ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);
|
| + ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
|
| +
|
| + x2base = vaddq_s32(xbase, vdupq_n_s32(dx4));
|
| + y2base = vaddq_s32(ybase, vdupq_n_s32(dy4));
|
| +
|
| + // store & bump
|
| + do {
|
| + int16x8x2_t hi16;
|
| +
|
| + hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
|
| + hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
|
| +
|
| + vst2q_s16(dst16, hi16);
|
| +
|
| + // moving base and on to the next
|
| + xbase = vaddq_s32(xbase, vdupq_n_s32(dx8));
|
| + ybase = vaddq_s32(ybase, vdupq_n_s32(dy8));
|
| + x2base = vaddq_s32(x2base, vdupq_n_s32(dx8));
|
| + y2base = vaddq_s32(y2base, vdupq_n_s32(dy8));
|
| +
|
| + dst16 += 16; // 8x32 aka 16x16
|
| + count -= 8;
|
| + fx += dx8;
|
| + fy += dy8;
|
| + } while (count >= 8);
|
| + xy = (uint32_t *) dst16;
|
| + }
|
| +
|
| + for (int i = count; i > 0; --i) {
|
| + *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX);
|
| + fx += dx; fy += dy;
|
| + }
|
| +}
|
| +
|
| +static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
|
| + uint32_t* SK_RESTRICT xy,
|
| + int count, int x, int y) {
|
| + SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
|
| +
|
| + PREAMBLE(s);
|
| + // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
|
| + int maxX = s.fBitmap->width() - 1;
|
| + int maxY = s.fBitmap->height() - 1;
|
| +
|
| + SkPerspIter iter(s.fInvMatrix,
|
| + SkIntToScalar(x) + SK_ScalarHalf,
|
| + SkIntToScalar(y) + SK_ScalarHalf, count);
|
| +
|
| + while ((count = iter.next()) != 0) {
|
| + const SkFixed* SK_RESTRICT srcXY = iter.getXY();
|
| +
|
| + if (count >= 8) {
|
| + int32_t *mysrc = (int32_t *) srcXY;
|
| + int16_t *mydst = (int16_t *) xy;
|
| + do {
|
| + int16x8x2_t hi16;
|
| + int32x4x2_t xy1, xy2;
|
| +
|
| + xy1 = vld2q_s32(mysrc);
|
| + xy2 = vld2q_s32(mysrc+8);
|
| +
|
| + hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
|
| + hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
|
| +
|
| + vst2q_s16(mydst, hi16);
|
| +
|
| + count -= 8; // 8 iterations
|
| + mysrc += 16; // 16 longs
|
| + mydst += 16; // 16 shorts, aka 8 longs
|
| + } while (count >= 8);
|
| + // get xy and srcXY fixed up
|
| + srcXY = (const SkFixed *) mysrc;
|
| + xy = (uint32_t *) mydst;
|
| + }
|
| +
|
| + while (--count >= 0) {
|
| + *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
|
| + TILEX_PROCF(srcXY[0], maxX);
|
| + srcXY += 2;
|
| + }
|
| + }
|
| +}
|
| +
|
| +static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
|
| + SkFixed one PREAMBLE_PARAM_Y) {
|
| + unsigned i = TILEY_PROCF(f, max);
|
| + i = (i << 4) | TILEY_LOW_BITS(f, max);
|
| + return (i << 14) | (TILEY_PROCF((f + one), max));
|
| +}
|
| +
|
| +static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
|
| + SkFixed one PREAMBLE_PARAM_X) {
|
| + unsigned i = TILEX_PROCF(f, max);
|
| + i = (i << 4) | TILEX_LOW_BITS(f, max);
|
| + return (i << 14) | (TILEX_PROCF((f + one), max));
|
| +}
|
| +
|
| +static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
|
| + SkFixed one PREAMBLE_PARAM_X) {
|
| + int32x4_t ret, res, wide_one;
|
| +
|
| + // Prepare constants
|
| + wide_one = vdupq_n_s32(one);
|
| +
|
| + // Step 1
|
| + res = TILEX_PROCF_NEON4(f, max);
|
| +
|
| + // Step 2
|
| + ret = TILEX_LOW_BITS_NEON4(f, max);
|
| + ret = vsliq_n_s32(ret, res, 4);
|
| +
|
| + // Step 3
|
| + res = TILEX_PROCF_NEON4(f + wide_one, max);
|
| + ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
|
| +
|
| + return ret;
|
| +}
|
| +
|
| +static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
|
| + SkFixed one PREAMBLE_PARAM_X) {
|
| + int32x4_t ret, res, wide_one;
|
| +
|
| + // Prepare constants
|
| + wide_one = vdupq_n_s32(one);
|
| +
|
| + // Step 1
|
| + res = TILEY_PROCF_NEON4(f, max);
|
| +
|
| + // Step 2
|
| + ret = TILEY_LOW_BITS_NEON4(f, max);
|
| + ret = vsliq_n_s32(ret, res, 4);
|
| +
|
| + // Step 3
|
| + res = TILEY_PROCF_NEON4(f + wide_one, max);
|
| + ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
|
| +
|
| + return ret;
|
| +}
|
| +
|
| +static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
|
| + uint32_t xy[], int count, int x, int y) {
|
| + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
|
| + SkMatrix::kScale_Mask)) == 0);
|
| + SkASSERT(s.fInvKy == 0);
|
| +
|
| + PREAMBLE(s);
|
| +
|
| + const unsigned maxX = s.fBitmap->width() - 1;
|
| + const SkFixed one = s.fFilterOneX;
|
| + const SkFixed dx = s.fInvSx;
|
| + SkFixed fx;
|
| +
|
| + {
|
| + SkPoint pt;
|
| + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
|
| + SkIntToScalar(y) + SK_ScalarHalf, &pt);
|
| + const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
|
| + const unsigned maxY = s.fBitmap->height() - 1;
|
| + // compute our two Y values up front
|
| + *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
|
| + // now initialize fx
|
| + fx = SkScalarToFixed(pt.fX) - (one >> 1);
|
| + }
|
| +
|
| +#ifdef CHECK_FOR_DECAL
|
| + // test if we don't need to apply the tile proc
|
| + if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
|
| + decal_filter_scale_neon(xy, fx, dx, count);
|
| + return;
|
| + }
|
| +#endif
|
| + {
|
| +
|
| + if (count >= 4) {
|
| + int32x4_t wide_fx;
|
| +
|
| + wide_fx = vdupq_n_s32(fx);
|
| + wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
|
| + wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
|
| + wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
|
| +
|
| + while (count >= 4) {
|
| + int32x4_t res;
|
| +
|
| + res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
|
| +
|
| + vst1q_u32(xy, vreinterpretq_u32_s32(res));
|
| +
|
| + wide_fx += vdupq_n_s32(dx+dx+dx+dx);
|
| + fx += dx+dx+dx+dx;
|
| + xy += 4;
|
| + count -= 4;
|
| + }
|
| + }
|
| +
|
| + while (--count >= 0) {
|
| + *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X);
|
| + fx += dx;
|
| + }
|
| +
|
| + }
|
| +}
|
| +
|
| +static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
|
| + uint32_t xy[], int count, int x, int y) {
|
| + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
|
| + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
|
| + SkMatrix::kScale_Mask |
|
| + SkMatrix::kAffine_Mask)) == 0);
|
| +
|
| + PREAMBLE(s);
|
| + SkPoint srcPt;
|
| + s.fInvProc(s.fInvMatrix,
|
| + SkIntToScalar(x) + SK_ScalarHalf,
|
| + SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
|
| +
|
| + SkFixed oneX = s.fFilterOneX;
|
| + SkFixed oneY = s.fFilterOneY;
|
| + SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
|
| + SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
|
| + SkFixed dx = s.fInvSx;
|
| + SkFixed dy = s.fInvKy;
|
| + unsigned maxX = s.fBitmap->width() - 1;
|
| + unsigned maxY = s.fBitmap->height() - 1;
|
| +
|
| + if (count >= 4) {
|
| + int32x4_t wide_fy, wide_fx;
|
| +
|
| + wide_fx = vdupq_n_s32(fx);
|
| + wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
|
| + wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
|
| + wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
|
| +
|
| + wide_fy = vdupq_n_s32(fy);
|
| + wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
|
| + wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
|
| + wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
|
| +
|
| + while (count >= 4) {
|
| + int32x4x2_t vxy;
|
| +
|
| + // do the X side, then the Y side, then interleave them
|
| + vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
|
| + vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
|
| +
|
| + // interleave as YXYXYXYX as part of the storing
|
| + vst2q_s32((int32_t*)xy, vxy);
|
| +
|
| + // prepare next iteration
|
| + wide_fx += vdupq_n_s32(dx+dx+dx+dx);
|
| + fx += dx + dx + dx + dx;
|
| + wide_fy += vdupq_n_s32(dy+dy+dy+dy);
|
| + fy += dy+dy+dy+dy;
|
| + xy += 8; // 4 x's, 4 y's
|
| + count -= 4;
|
| + }
|
| + }
|
| +
|
| + while (--count >= 0) {
|
| + // NB: writing Y/X
|
| + *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
|
| + fy += dy;
|
| + *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
|
| + fx += dx;
|
| + }
|
| +}
|
| +
|
| +static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
|
| + uint32_t* SK_RESTRICT xy, int count,
|
| + int x, int y) {
|
| + SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
|
| +
|
| + PREAMBLE(s);
|
| + unsigned maxX = s.fBitmap->width() - 1;
|
| + unsigned maxY = s.fBitmap->height() - 1;
|
| + SkFixed oneX = s.fFilterOneX;
|
| + SkFixed oneY = s.fFilterOneY;
|
| +
|
| + SkPerspIter iter(s.fInvMatrix,
|
| + SkIntToScalar(x) + SK_ScalarHalf,
|
| + SkIntToScalar(y) + SK_ScalarHalf, count);
|
| +
|
| + while ((count = iter.next()) != 0) {
|
| + const SkFixed* SK_RESTRICT srcXY = iter.getXY();
|
| +
|
| + while (count >= 4) {
|
| + int32x4_t wide_x, wide_y;
|
| + int32x4x2_t vxy, vresyx;
|
| +
|
| + // load src: x-y-x-y-x-y-x-y
|
| + vxy = vld2q_s32(srcXY);
|
| +
|
| + // do the X side, then the Y side, then interleave them
|
| + wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
|
| + wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
|
| +
|
| + vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
|
| + vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
|
| +
|
| + // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
|
| + vst2q_s32((int32_t*)xy, vresyx);
|
| +
|
| + // on to the next iteration
|
| + srcXY += 2*4;
|
| + count -= 4;
|
| + xy += 2*4;
|
| + }
|
| +
|
| + while (--count >= 0) {
|
| + // NB: we read x/y, we write y/x
|
| + *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
|
| + oneY PREAMBLE_ARG_Y);
|
| + *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
|
| + oneX PREAMBLE_ARG_X);
|
| + srcXY += 2;
|
| + }
|
| + }
|
| +}
|
| +
|
| +const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
|
| + SCALE_NOFILTER_NAME,
|
| + SCALE_FILTER_NAME,
|
| + AFFINE_NOFILTER_NAME,
|
| + AFFINE_FILTER_NAME,
|
| + PERSP_NOFILTER_NAME,
|
| + PERSP_FILTER_NAME
|
| +};
|
| +
|
| +#undef TILEX_PROCF_NEON8
|
| +#undef TILEY_PROCF_NEON8
|
| +#undef TILEX_PROCF_NEON4
|
| +#undef TILEY_PROCF_NEON4
|
| +#undef TILEX_LOW_BITS_NEON4
|
| +#undef TILEY_LOW_BITS_NEON4
|
| +
|
| +#undef MAKENAME
|
| +#undef TILEX_PROCF
|
| +#undef TILEY_PROCF
|
| +#ifdef CHECK_FOR_DECAL
|
| + #undef CHECK_FOR_DECAL
|
| +#endif
|
| +
|
| +#undef SCALE_NOFILTER_NAME
|
| +#undef SCALE_FILTER_NAME
|
| +#undef AFFINE_NOFILTER_NAME
|
| +#undef AFFINE_FILTER_NAME
|
| +#undef PERSP_NOFILTER_NAME
|
| +#undef PERSP_FILTER_NAME
|
| +
|
| +#undef PREAMBLE
|
| +#undef PREAMBLE_PARAM_X
|
| +#undef PREAMBLE_PARAM_Y
|
| +#undef PREAMBLE_ARG_X
|
| +#undef PREAMBLE_ARG_Y
|
| +
|
| +#undef TILEX_LOW_BITS
|
| +#undef TILEY_LOW_BITS
|
| +
|
|
|