Index: src/opts/SkBitmapProcState_matrix_neon.h |
diff --git a/src/opts/SkBitmapProcState_matrix_neon.h b/src/opts/SkBitmapProcState_matrix_neon.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..bc1b59f204be06f319a5b93a09c9a17959710864 |
--- /dev/null |
+++ b/src/opts/SkBitmapProcState_matrix_neon.h |
@@ -0,0 +1,504 @@ |
+ |
+#include <arm_neon.h> |
+ |
+ |
+#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) |
+#define SCALE_FILTER_NAME MAKENAME(_filter_scale) |
+#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) |
+#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) |
+#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) |
+#define PERSP_FILTER_NAME MAKENAME(_filter_persp) |
+ |
+#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) |
+#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) |
+#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4) |
+#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4) |
+ |
+#ifndef PREAMBLE |
+ #define PREAMBLE(state) |
+ #define PREAMBLE_PARAM_X |
+ #define PREAMBLE_PARAM_Y |
+ #define PREAMBLE_ARG_X |
+ #define PREAMBLE_ARG_Y |
+#endif |
+ |
+static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, |
+ uint32_t xy[], int count, int x, int y) { |
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | |
+ SkMatrix::kScale_Mask)) == 0); |
+ |
+ PREAMBLE(s); |
+ |
+ // we store y, x, x, x, x, x |
+ const unsigned maxX = s.fBitmap->width() - 1; |
+ SkFixed fx; |
+ { |
+ SkPoint pt; |
+ s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, |
+ SkIntToScalar(y) + SK_ScalarHalf, &pt); |
+ fx = SkScalarToFixed(pt.fY); |
+ const unsigned maxY = s.fBitmap->height() - 1; |
+ *xy++ = TILEY_PROCF(fx, maxY); |
+ fx = SkScalarToFixed(pt.fX); |
+ } |
+ |
+ if (0 == maxX) { |
+ // all of the following X values must be 0 |
+ memset(xy, 0, count * sizeof(uint16_t)); |
+ return; |
+ } |
+ |
+ const SkFixed dx = s.fInvSx; |
+ |
+#ifdef CHECK_FOR_DECAL |
+ // test if we don't need to apply the tile proc |
+ if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { |
+ decal_nofilter_scale_neon(xy, fx, dx, count); |
+ return; |
+ } |
+#endif |
+ |
+ if (count >= 8) { |
+ SkFixed dx2 = dx+dx; |
+ SkFixed dx4 = dx2+dx2; |
+ SkFixed dx8 = dx4+dx4; |
+ |
+ // now build fx/fx+dx/fx+2dx/fx+3dx |
+ SkFixed fx1, fx2, fx3; |
+ int32x4_t lbase, hbase; |
+ int16_t *dst16 = (int16_t *)xy; |
+ |
+ fx1 = fx+dx; |
+ fx2 = fx1+dx; |
+ fx3 = fx2+dx; |
+ |
+ lbase = vdupq_n_s32(fx); |
+ lbase = vsetq_lane_s32(fx1, lbase, 1); |
+ lbase = vsetq_lane_s32(fx2, lbase, 2); |
+ lbase = vsetq_lane_s32(fx3, lbase, 3); |
+ hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); |
+ |
+ // store & bump |
+ while (count >= 8) { |
+ |
+ int16x8_t fx8; |
+ |
+ fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX); |
+ |
+ vst1q_s16(dst16, fx8); |
+ |
+ // but preserving base & on to the next |
+ lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); |
+ hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); |
+ dst16 += 8; |
+ count -= 8; |
+ fx += dx8; |
+ }; |
+ xy = (uint32_t *) dst16; |
+ } |
+ |
+ uint16_t* xx = (uint16_t*)xy; |
+ for (int i = count; i > 0; --i) { |
+ *xx++ = TILEX_PROCF(fx, maxX); |
+ fx += dx; |
+ } |
+} |
+ |
+static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, |
+ uint32_t xy[], int count, int x, int y) { |
+ SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); |
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | |
+ SkMatrix::kScale_Mask | |
+ SkMatrix::kAffine_Mask)) == 0); |
+ |
+ PREAMBLE(s); |
+ SkPoint srcPt; |
+ s.fInvProc(s.fInvMatrix, |
+ SkIntToScalar(x) + SK_ScalarHalf, |
+ SkIntToScalar(y) + SK_ScalarHalf, &srcPt); |
+ |
+ SkFixed fx = SkScalarToFixed(srcPt.fX); |
+ SkFixed fy = SkScalarToFixed(srcPt.fY); |
+ SkFixed dx = s.fInvSx; |
+ SkFixed dy = s.fInvKy; |
+ int maxX = s.fBitmap->width() - 1; |
+ int maxY = s.fBitmap->height() - 1; |
+ |
+ if (count >= 8) { |
+ SkFixed dx4 = dx * 4; |
+ SkFixed dy4 = dy * 4; |
+ SkFixed dx8 = dx * 8; |
+ SkFixed dy8 = dy * 8; |
+ |
+ int32x4_t xbase, ybase; |
+ int32x4_t x2base, y2base; |
+ int16_t *dst16 = (int16_t *) xy; |
+ |
+ // now build fx, fx+dx, fx+2dx, fx+3dx |
+ xbase = vdupq_n_s32(fx); |
+ xbase = vsetq_lane_s32(fx+dx, xbase, 1); |
+ xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2); |
+ xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3); |
+ |
+ // same for fy |
+ ybase = vdupq_n_s32(fy); |
+ ybase = vsetq_lane_s32(fy+dy, ybase, 1); |
+ ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2); |
+ ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); |
+ |
+ x2base = vaddq_s32(xbase, vdupq_n_s32(dx4)); |
+ y2base = vaddq_s32(ybase, vdupq_n_s32(dy4)); |
+ |
+ // store & bump |
+ do { |
+ int16x8x2_t hi16; |
+ |
+ hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX); |
+ hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY); |
+ |
+ vst2q_s16(dst16, hi16); |
+ |
+ // moving base and on to the next |
+ xbase = vaddq_s32(xbase, vdupq_n_s32(dx8)); |
+ ybase = vaddq_s32(ybase, vdupq_n_s32(dy8)); |
+ x2base = vaddq_s32(x2base, vdupq_n_s32(dx8)); |
+ y2base = vaddq_s32(y2base, vdupq_n_s32(dy8)); |
+ |
+ dst16 += 16; // 8x32 aka 16x16 |
+ count -= 8; |
+ fx += dx8; |
+ fy += dy8; |
+ } while (count >= 8); |
+ xy = (uint32_t *) dst16; |
+ } |
+ |
+ for (int i = count; i > 0; --i) { |
+ *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX); |
+ fx += dx; fy += dy; |
+ } |
+} |
+ |
+static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, |
+ uint32_t* SK_RESTRICT xy, |
+ int count, int x, int y) { |
+ SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); |
+ |
+ PREAMBLE(s); |
+ // max{X,Y} are int here, but later shown/assumed to fit in 16 bits |
+ int maxX = s.fBitmap->width() - 1; |
+ int maxY = s.fBitmap->height() - 1; |
+ |
+ SkPerspIter iter(s.fInvMatrix, |
+ SkIntToScalar(x) + SK_ScalarHalf, |
+ SkIntToScalar(y) + SK_ScalarHalf, count); |
+ |
+ while ((count = iter.next()) != 0) { |
+ const SkFixed* SK_RESTRICT srcXY = iter.getXY(); |
+ |
+ if (count >= 8) { |
+ int32_t *mysrc = (int32_t *) srcXY; |
+ int16_t *mydst = (int16_t *) xy; |
+ do { |
+ int16x8x2_t hi16; |
+ int32x4x2_t xy1, xy2; |
+ |
+ xy1 = vld2q_s32(mysrc); |
+ xy2 = vld2q_s32(mysrc+8); |
+ |
+ hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX); |
+ hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY); |
+ |
+ vst2q_s16(mydst, hi16); |
+ |
+ count -= 8; // 8 iterations |
+ mysrc += 16; // 16 longs |
+ mydst += 16; // 16 shorts, aka 8 longs |
+ } while (count >= 8); |
+ // get xy and srcXY fixed up |
+ srcXY = (const SkFixed *) mysrc; |
+ xy = (uint32_t *) mydst; |
+ } |
+ |
+ while (--count >= 0) { |
+ *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | |
+ TILEX_PROCF(srcXY[0], maxX); |
+ srcXY += 2; |
+ } |
+ } |
+} |
+ |
+static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, |
+ SkFixed one PREAMBLE_PARAM_Y) { |
+ unsigned i = TILEY_PROCF(f, max); |
+ i = (i << 4) | TILEY_LOW_BITS(f, max); |
+ return (i << 14) | (TILEY_PROCF((f + one), max)); |
+} |
+ |
+static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, |
+ SkFixed one PREAMBLE_PARAM_X) { |
+ unsigned i = TILEX_PROCF(f, max); |
+ i = (i << 4) | TILEX_LOW_BITS(f, max); |
+ return (i << 14) | (TILEX_PROCF((f + one), max)); |
+} |
+ |
+static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max, |
+ SkFixed one PREAMBLE_PARAM_X) { |
+ int32x4_t ret, res, wide_one; |
+ |
+ // Prepare constants |
+ wide_one = vdupq_n_s32(one); |
+ |
+ // Step 1 |
+ res = TILEX_PROCF_NEON4(f, max); |
+ |
+ // Step 2 |
+ ret = TILEX_LOW_BITS_NEON4(f, max); |
+ ret = vsliq_n_s32(ret, res, 4); |
+ |
+ // Step 3 |
+ res = TILEX_PROCF_NEON4(f + wide_one, max); |
+ ret = vorrq_s32(vshlq_n_s32(ret, 14), res); |
+ |
+ return ret; |
+} |
+ |
+static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max, |
+ SkFixed one PREAMBLE_PARAM_X) { |
+ int32x4_t ret, res, wide_one; |
+ |
+ // Prepare constants |
+ wide_one = vdupq_n_s32(one); |
+ |
+ // Step 1 |
+ res = TILEY_PROCF_NEON4(f, max); |
+ |
+ // Step 2 |
+ ret = TILEY_LOW_BITS_NEON4(f, max); |
+ ret = vsliq_n_s32(ret, res, 4); |
+ |
+ // Step 3 |
+ res = TILEY_PROCF_NEON4(f + wide_one, max); |
+ ret = vorrq_s32(vshlq_n_s32(ret, 14), res); |
+ |
+ return ret; |
+} |
+ |
+static void SCALE_FILTER_NAME(const SkBitmapProcState& s, |
+ uint32_t xy[], int count, int x, int y) { |
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | |
+ SkMatrix::kScale_Mask)) == 0); |
+ SkASSERT(s.fInvKy == 0); |
+ |
+ PREAMBLE(s); |
+ |
+ const unsigned maxX = s.fBitmap->width() - 1; |
+ const SkFixed one = s.fFilterOneX; |
+ const SkFixed dx = s.fInvSx; |
+ SkFixed fx; |
+ |
+ { |
+ SkPoint pt; |
+ s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, |
+ SkIntToScalar(y) + SK_ScalarHalf, &pt); |
+ const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); |
+ const unsigned maxY = s.fBitmap->height() - 1; |
+ // compute our two Y values up front |
+ *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); |
+ // now initialize fx |
+ fx = SkScalarToFixed(pt.fX) - (one >> 1); |
+ } |
+ |
+#ifdef CHECK_FOR_DECAL |
+ // test if we don't need to apply the tile proc |
+ if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { |
+ decal_filter_scale_neon(xy, fx, dx, count); |
+ return; |
+ } |
+#endif |
+ { |
+ |
+ if (count >= 4) { |
+ int32x4_t wide_fx; |
+ |
+ wide_fx = vdupq_n_s32(fx); |
+ wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); |
+ wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); |
+ wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); |
+ |
+ while (count >= 4) { |
+ int32x4_t res; |
+ |
+ res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X); |
+ |
+ vst1q_u32(xy, vreinterpretq_u32_s32(res)); |
+ |
+ wide_fx += vdupq_n_s32(dx+dx+dx+dx); |
+ fx += dx+dx+dx+dx; |
+ xy += 4; |
+ count -= 4; |
+ } |
+ } |
+ |
+ while (--count >= 0) { |
+ *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X); |
+ fx += dx; |
+ } |
+ |
+ } |
+} |
+ |
+static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, |
+ uint32_t xy[], int count, int x, int y) { |
+ SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); |
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | |
+ SkMatrix::kScale_Mask | |
+ SkMatrix::kAffine_Mask)) == 0); |
+ |
+ PREAMBLE(s); |
+ SkPoint srcPt; |
+ s.fInvProc(s.fInvMatrix, |
+ SkIntToScalar(x) + SK_ScalarHalf, |
+ SkIntToScalar(y) + SK_ScalarHalf, &srcPt); |
+ |
+ SkFixed oneX = s.fFilterOneX; |
+ SkFixed oneY = s.fFilterOneY; |
+ SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); |
+ SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); |
+ SkFixed dx = s.fInvSx; |
+ SkFixed dy = s.fInvKy; |
+ unsigned maxX = s.fBitmap->width() - 1; |
+ unsigned maxY = s.fBitmap->height() - 1; |
+ |
+ if (count >= 4) { |
+ int32x4_t wide_fy, wide_fx; |
+ |
+ wide_fx = vdupq_n_s32(fx); |
+ wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); |
+ wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); |
+ wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); |
+ |
+ wide_fy = vdupq_n_s32(fy); |
+ wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); |
+ wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); |
+ wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); |
+ |
+ while (count >= 4) { |
+ int32x4x2_t vxy; |
+ |
+ // do the X side, then the Y side, then interleave them |
+ vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y); |
+ vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X); |
+ |
+ // interleave as YXYXYXYX as part of the storing |
+ vst2q_s32((int32_t*)xy, vxy); |
+ |
+ // prepare next iteration |
+ wide_fx += vdupq_n_s32(dx+dx+dx+dx); |
+ fx += dx + dx + dx + dx; |
+ wide_fy += vdupq_n_s32(dy+dy+dy+dy); |
+ fy += dy+dy+dy+dy; |
+ xy += 8; // 4 x's, 4 y's |
+ count -= 4; |
+ } |
+ } |
+ |
+ while (--count >= 0) { |
+ // NB: writing Y/X |
+ *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); |
+ fy += dy; |
+ *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); |
+ fx += dx; |
+ } |
+} |
+ |
+static void PERSP_FILTER_NAME(const SkBitmapProcState& s, |
+ uint32_t* SK_RESTRICT xy, int count, |
+ int x, int y) { |
+ SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); |
+ |
+ PREAMBLE(s); |
+ unsigned maxX = s.fBitmap->width() - 1; |
+ unsigned maxY = s.fBitmap->height() - 1; |
+ SkFixed oneX = s.fFilterOneX; |
+ SkFixed oneY = s.fFilterOneY; |
+ |
+ SkPerspIter iter(s.fInvMatrix, |
+ SkIntToScalar(x) + SK_ScalarHalf, |
+ SkIntToScalar(y) + SK_ScalarHalf, count); |
+ |
+ while ((count = iter.next()) != 0) { |
+ const SkFixed* SK_RESTRICT srcXY = iter.getXY(); |
+ |
+ while (count >= 4) { |
+ int32x4_t wide_x, wide_y; |
+ int32x4x2_t vxy, vresyx; |
+ |
+ // load src: x-y-x-y-x-y-x-y |
+ vxy = vld2q_s32(srcXY); |
+ |
+ // do the X side, then the Y side, then interleave them |
+ wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1)); |
+ wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1)); |
+ |
+ vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y); |
+ vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X); |
+ |
+ // store interleaved as y-x-y-x-y-x-y-x (NB != read order) |
+ vst2q_s32((int32_t*)xy, vresyx); |
+ |
+ // on to the next iteration |
+ srcXY += 2*4; |
+ count -= 4; |
+ xy += 2*4; |
+ } |
+ |
+ while (--count >= 0) { |
+ // NB: we read x/y, we write y/x |
+ *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, |
+ oneY PREAMBLE_ARG_Y); |
+ *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, |
+ oneX PREAMBLE_ARG_X); |
+ srcXY += 2; |
+ } |
+ } |
+} |
+ |
+const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { |
+ SCALE_NOFILTER_NAME, |
+ SCALE_FILTER_NAME, |
+ AFFINE_NOFILTER_NAME, |
+ AFFINE_FILTER_NAME, |
+ PERSP_NOFILTER_NAME, |
+ PERSP_FILTER_NAME |
+}; |
+ |
+#undef TILEX_PROCF_NEON8 |
+#undef TILEY_PROCF_NEON8 |
+#undef TILEX_PROCF_NEON4 |
+#undef TILEY_PROCF_NEON4 |
+#undef TILEX_LOW_BITS_NEON4 |
+#undef TILEY_LOW_BITS_NEON4 |
+ |
+#undef MAKENAME |
+#undef TILEX_PROCF |
+#undef TILEY_PROCF |
+#ifdef CHECK_FOR_DECAL |
+ #undef CHECK_FOR_DECAL |
+#endif |
+ |
+#undef SCALE_NOFILTER_NAME |
+#undef SCALE_FILTER_NAME |
+#undef AFFINE_NOFILTER_NAME |
+#undef AFFINE_FILTER_NAME |
+#undef PERSP_NOFILTER_NAME |
+#undef PERSP_FILTER_NAME |
+ |
+#undef PREAMBLE |
+#undef PREAMBLE_PARAM_X |
+#undef PREAMBLE_PARAM_Y |
+#undef PREAMBLE_ARG_X |
+#undef PREAMBLE_ARG_Y |
+ |
+#undef TILEX_LOW_BITS |
+#undef TILEY_LOW_BITS |
+ |