Index: src/opts/SkBlurImageFilter_opts.h |
diff --git a/src/opts/SkBlurImageFilter_opts.h b/src/opts/SkBlurImageFilter_opts.h |
index 94d734627ba64848601ee3e3aa88d7c338a7e54a..720130b1ecf032f303ee50bc67f3ed7ef365aa4a 100644 |
--- a/src/opts/SkBlurImageFilter_opts.h |
+++ b/src/opts/SkBlurImageFilter_opts.h |
@@ -67,6 +67,21 @@ static inline __m128i mullo_epi32(__m128i a, __m128i b) { |
#elif defined(SK_ARM_HAS_NEON) |
+// val = (sum * scale * 2 + 0x8000) >> 16 |
+#define STORE_SUMS_DOUBLE \ |
mtklein
2015/10/28 22:31:17
This just symmetry or are you thinking of expandin
Stephen White
2015/10/28 22:38:33
Well, mostly because it's repeated four times belo
|
+ uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( \ |
+ vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); \ |
+ if (dstDirection == BlurDirection::kX) { \ |
+ uint32x2_t px2 = vreinterpret_u32_u8(vmovn_u16(resultPixels)); \ |
+ vst1_lane_u32(dptr + 0, px2, 0); \ |
+ vst1_lane_u32(dptr + width, px2, 1); \ |
+ } else { \ |
+ vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ |
+ } |
+ |
+#define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) |
+#define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) |
+ |
// Fast path for kernel sizes between 2 and 127, working on two rows at a time. |
template<BlurDirection srcDirection, BlurDirection dstDirection> |
void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize, |
@@ -84,7 +99,9 @@ void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int |
return vld1_u8((uint8_t*)s); |
} |
}; |
- const int rightBorder = SkMin32(rightOffset + 1, width); |
+ int incrementStart = SkMax32(-rightOffset - 1, -width); |
+ int incrementEnd = SkMax32(width - rightOffset - 1, 0); |
+ int decrementStart = SkMin32(leftOffset, width); |
const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : *height; |
const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
@@ -93,34 +110,37 @@ void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int |
for (; *height >= 2; *height -= 2) { |
uint16x8_t sum = vdupq_n_u16(0); |
- const SkPMColor* p = *src; |
- for (int i = 0; i < rightBorder; i++) { |
- sum = vaddw_u8(sum, load_2_pixels(p)); |
- p += srcStrideX; |
- } |
- |
- const SkPMColor* sptr = *src; |
+ const SkPMColor* lptr = *src; |
+ const SkPMColor* rptr = *src; |
SkPMColor* dptr = *dst; |
- for (int x = 0; x < width; x++) { |
- // val = (sum * scale * 2 + 0x8000) >> 16 |
- uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( |
- vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); |
- if (dstDirection == BlurDirection::kX) { |
- uint32x2_t px2 = vreinterpret_u32_u8(vmovn_u16(resultPixels)); |
- vst1_lane_u32(dptr + 0, px2, 0); |
- vst1_lane_u32(dptr + width, px2, 1); |
- } else { |
- vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); |
- } |
- |
- if (x >= leftOffset) { |
- sum = vsubw_u8(sum, load_2_pixels(sptr - leftOffset * srcStrideX)); |
- } |
- if (x + rightOffset + 1 < width) { |
- sum = vaddw_u8(sum, load_2_pixels(sptr + (rightOffset + 1) * srcStrideX)); |
- } |
- sptr += srcStrideX; |
+ int x; |
+ for (x = incrementStart; x < 0; ++x) { |
+ INCREMENT_SUMS_DOUBLE(rptr); |
+ rptr += srcStrideX; |
+ } |
+ for (; x < decrementStart && x < incrementEnd; ++x) { |
+ STORE_SUMS_DOUBLE |
dptr += dstStrideX; |
+ INCREMENT_SUMS_DOUBLE(rptr); |
+ rptr += srcStrideX; |
+ } |
+ for (x = decrementStart; x < incrementEnd; ++x) { |
+ STORE_SUMS_DOUBLE |
+ dptr += dstStrideX; |
+ INCREMENT_SUMS_DOUBLE(rptr); |
+ rptr += srcStrideX; |
+ DECREMENT_SUMS_DOUBLE(lptr); |
+ lptr += srcStrideX; |
+ } |
+ for (x = incrementEnd; x < decrementStart; ++x) { |
+ STORE_SUMS_DOUBLE |
+ dptr += dstStrideX; |
+ } |
+ for (; x < width; ++x) { |
+ STORE_SUMS_DOUBLE |
+ dptr += dstStrideX; |
+ DECREMENT_SUMS_DOUBLE(lptr); |
+ lptr += srcStrideX; |
} |
*src += srcStrideY * 2; |
*dst += dstStrideY * 2; |