| Index: src/effects/SkBlurImageFilter.cpp
|
| diff --git a/src/effects/SkBlurImageFilter.cpp b/src/effects/SkBlurImageFilter.cpp
|
| index 0fa54b555cd78cfae0b800b699a155f503bd5cd0..c04c258829ed846faeb7c1a89508e6e13fde902d 100644
|
| --- a/src/effects/SkBlurImageFilter.cpp
|
| +++ b/src/effects/SkBlurImageFilter.cpp
|
| @@ -39,28 +39,55 @@ void SkBlurImageFilter::flatten(SkFlattenableWriteBuffer& buffer) const {
|
| buffer.writeScalar(fSigma.fHeight);
|
| }
|
|
|
| -static void boxBlurX(const SkBitmap& src, SkBitmap* dst, int kernelSize,
|
| - int leftOffset, int rightOffset, const SkIRect& bounds)
|
| +enum BlurDirection {  // Template selector used by boxBlur() to pick the per-pixel and per-line pointer steps.
|
| + kX, kY  // kX: consecutive pixels of a run are adjacent in memory; kY: consecutive pixels are one row apart.
|
| +};
|
| +
|
| +/**
|
| + *
|
| + * In order to make memory accesses cache-friendly, we reorder the passes to
|
| + * use contiguous memory reads wherever possible.
|
| + *
|
| + * For example, the 6 passes of the X-and-Y blur case are rewritten as
|
| + * follows. Instead of 3 passes in X and 3 passes in Y, we perform
|
| + * 2 passes in X, 1 pass in X transposed to Y on write, 2 passes in X,
|
| + * then 1 pass in X transposed to Y on write.
|
| + *
|
| + * +----+       +----+       +----+        +---+       +---+       +---+        +----+
|
| + * | AB | ----> | AB | ----> | AB | -----> | A | ----> | A | ----> | A | -----> | AB |
|
| + * +----+ blurX +----+ blurX +----+ blurXY | B | blurX | B | blurX | B | blurXY +----+
|
| + *                                         +---+       +---+       +---+
|
| + *
|
| + * In this way, two of the y-blurs become x-blurs applied to transposed
|
| + * images, and all memory reads are contiguous.
|
| + */
|
| +
|
| +template<BlurDirection srcDirection, BlurDirection dstDirection>
|
| +static void boxBlur(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
|
| + int leftOffset, int rightOffset, int width, int height)
|
| {
|
| - int width = bounds.width(), height = bounds.height();
|
| int rightBorder = SkMin32(rightOffset + 1, width);
|
| + int srcStrideX = srcDirection == kX ? 1 : srcStride;
|
| + int dstStrideX = dstDirection == kX ? 1 : height;
|
| + int srcStrideY = srcDirection == kX ? srcStride : 1;
|
| + int dstStrideY = dstDirection == kX ? width : 1;
|
| #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
|
| uint32_t scale = (1 << 24) / kernelSize;
|
| uint32_t half = 1 << 23;
|
| #endif
|
| for (int y = 0; y < height; ++y) {
|
| int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
|
| - SkPMColor* p = src.getAddr32(bounds.fLeft, y + bounds.fTop);
|
| + const SkPMColor* p = src;
|
| for (int i = 0; i < rightBorder; ++i) {
|
| sumA += SkGetPackedA32(*p);
|
| sumR += SkGetPackedR32(*p);
|
| sumG += SkGetPackedG32(*p);
|
| sumB += SkGetPackedB32(*p);
|
| - p++;
|
| + p += srcStrideX;
|
| }
|
|
|
| - const SkColor* sptr = src.getAddr32(bounds.fLeft, bounds.fTop + y);
|
| - SkColor* dptr = dst->getAddr32(0, y);
|
| + const SkPMColor* sptr = src;
|
| + SkColor* dptr = dst;
|
| for (int x = 0; x < width; ++x) {
|
| #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
|
| *dptr = SkPackARGB32((sumA * scale + half) >> 24,
|
| @@ -74,81 +101,48 @@ static void boxBlurX(const SkBitmap& src, SkBitmap* dst, int kernelSize,
|
| sumB / kernelSize);
|
| #endif
|
| if (x >= leftOffset) {
|
| - SkColor l = *(sptr - leftOffset);
|
| + SkColor l = *(sptr - leftOffset * srcStrideX);
|
| sumA -= SkGetPackedA32(l);
|
| sumR -= SkGetPackedR32(l);
|
| sumG -= SkGetPackedG32(l);
|
| sumB -= SkGetPackedB32(l);
|
| }
|
| if (x + rightOffset + 1 < width) {
|
| - SkColor r = *(sptr + rightOffset + 1);
|
| + SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);
|
| sumA += SkGetPackedA32(r);
|
| sumR += SkGetPackedR32(r);
|
| sumG += SkGetPackedG32(r);
|
| sumB += SkGetPackedB32(r);
|
| }
|
| - sptr++;
|
| - dptr++;
|
| + sptr += srcStrideX;
|
| + if (srcDirection == kY) {
|
| + SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX);
|
| + }
|
| + dptr += dstStrideX;
|
| }
|
| + src += srcStrideY;
|
| + dst += dstStrideY;
|
| }
|
| }
|
|
|
| -static void boxBlurY(const SkBitmap& src, SkBitmap* dst, int kernelSize,
|
| - int topOffset, int bottomOffset, const SkIRect& bounds)
|
| +static void boxBlurX(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
|
| + int leftOffset, int rightOffset, int width, int height)
|
| {
|
| - int width = bounds.width(), height = bounds.height();
|
| - int bottomBorder = SkMin32(bottomOffset + 1, height);
|
| - int srcStride = src.rowBytesAsPixels();
|
| - int dstStride = dst->rowBytesAsPixels();
|
| -#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
|
| - uint32_t scale = (1 << 24) / kernelSize;
|
| - uint32_t half = 1 << 23;
|
| -#endif
|
| - for (int x = 0; x < width; ++x) {
|
| - int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
|
| - SkColor* p = src.getAddr32(bounds.fLeft + x, bounds.fTop);
|
| - for (int i = 0; i < bottomBorder; ++i) {
|
| - sumA += SkGetPackedA32(*p);
|
| - sumR += SkGetPackedR32(*p);
|
| - sumG += SkGetPackedG32(*p);
|
| - sumB += SkGetPackedB32(*p);
|
| - p += srcStride;
|
| - }
|
| + boxBlur<kX, kX>(src, srcStride, dst, kernelSize, leftOffset, rightOffset, width, height);
|
| +}
|
|
|
| - const SkColor* sptr = src.getAddr32(bounds.fLeft + x, bounds.fTop);
|
| - SkColor* dptr = dst->getAddr32(x, 0);
|
| - for (int y = 0; y < height; ++y) {
|
| #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
|
| - *dptr = SkPackARGB32((sumA * scale + half) >> 24,
|
| - (sumR * scale + half) >> 24,
|
| - (sumG * scale + half) >> 24,
|
| - (sumB * scale + half) >> 24);
|
| -#else
|
| - *dptr = SkPackARGB32(sumA / kernelSize,
|
| - sumR / kernelSize,
|
| - sumG / kernelSize,
|
| - sumB / kernelSize);
|
| +static void boxBlurXY(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
|
| + int leftOffset, int rightOffset, int width, int height)
|
| +{  // X-direction box blur whose result is written transposed; forwards to boxBlur<kX, kY>.
|
| + boxBlur<kX, kY>(src, srcStride, dst, kernelSize, leftOffset, rightOffset, width, height);  // reads step by 1; writes step by `height` per pixel and by 1 per line
|
| +}
|
| #endif
|
| - if (y >= topOffset) {
|
| - SkColor l = *(sptr - topOffset * srcStride);
|
| - sumA -= SkGetPackedA32(l);
|
| - sumR -= SkGetPackedR32(l);
|
| - sumG -= SkGetPackedG32(l);
|
| - sumB -= SkGetPackedB32(l);
|
| - }
|
| - if (y + bottomOffset + 1 < height) {
|
| - SkColor r = *(sptr + (bottomOffset + 1) * srcStride);
|
| - sumA += SkGetPackedA32(r);
|
| - sumR += SkGetPackedR32(r);
|
| - sumG += SkGetPackedG32(r);
|
| - sumB += SkGetPackedB32(r);
|
| - }
|
| - sptr += srcStride;
|
| - // The next leading pixel seems to be too hard to predict. Hint the fetch.
|
| - SK_PREFETCH(sptr + (bottomOffset + 1) * srcStride);
|
| - dptr += dstStride;
|
| - }
|
| - }
|
| +
|
| +static void boxBlurY(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
|
| + int topOffset, int bottomOffset, int width, int height)
|
| +{  // Box blur that reads and writes with a row-sized step per pixel; forwards to boxBlur<kY, kY>.
|
| + boxBlur<kY, kY>(src, srcStride, dst, kernelSize, topOffset, bottomOffset, width, height);  // `width` is the length of each blurred run, `height` the number of runs; the calls in this patch pass (h, w)
|
| }
|
|
|
| static void getBox3Params(SkScalar s, int *kernelSize, int* kernelSize3, int *lowOffset,
|
| @@ -213,21 +207,35 @@ bool SkBlurImageFilter::onFilterImage(Proxy* proxy,
|
| return false;
|
| }
|
|
|
| + const SkPMColor* s = src.getAddr32(srcBounds.left(), srcBounds.top());
|
| + SkPMColor* t = temp.getAddr32(0, 0);
|
| + SkPMColor* d = dst->getAddr32(0, 0);
|
| + int w = dstBounds.width(), h = dstBounds.height();
|
| + int sw = src.rowBytesAsPixels();
|
| if (kernelSizeX > 0 && kernelSizeY > 0) {
|
| - boxBlurX(src, &temp, kernelSizeX, lowOffsetX, highOffsetX, srcBounds);
|
| - boxBlurY(temp, dst, kernelSizeY, lowOffsetY, highOffsetY, dstBounds);
|
| - boxBlurX(*dst, &temp, kernelSizeX, highOffsetX, lowOffsetX, dstBounds);
|
| - boxBlurY(temp, dst, kernelSizeY, highOffsetY, lowOffsetY, dstBounds);
|
| - boxBlurX(*dst, &temp, kernelSizeX3, highOffsetX, highOffsetX, dstBounds);
|
| - boxBlurY(temp, dst, kernelSizeY3, highOffsetY, highOffsetY, dstBounds);
|
| +#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
|
| + boxBlurX(s, sw, t, kernelSizeX, lowOffsetX, highOffsetX, w, h);
|
| + boxBlurX(t, w, d, kernelSizeX, highOffsetX, lowOffsetX, w, h);
|
| + boxBlurXY(d, w, t, kernelSizeX3, highOffsetX, highOffsetX, w, h);
|
| + boxBlurX(t, h, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
|
| + boxBlurX(d, h, t, kernelSizeY, highOffsetY, lowOffsetY, h, w);
|
| + boxBlurXY(t, h, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
|
| +#else
|
| + boxBlurX(s, sw, t, kernelSizeX, lowOffsetX, highOffsetX, w, h);
|
| + boxBlurY(t, w, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
|
| + boxBlurX(d, w, t, kernelSizeX, highOffsetX, lowOffsetX, w, h);
|
| + boxBlurY(t, w, d, kernelSizeY, highOffsetY, lowOffsetY, h, w);
|
| + boxBlurX(d, w, t, kernelSizeX3, highOffsetX, highOffsetX, w, h);
|
| + boxBlurY(t, w, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
|
| +#endif
|
| } else if (kernelSizeX > 0) {
|
| - boxBlurX(src, dst, kernelSizeX, lowOffsetX, highOffsetX, srcBounds);
|
| - boxBlurX(*dst, &temp, kernelSizeX, highOffsetX, lowOffsetX, dstBounds);
|
| - boxBlurX(temp, dst, kernelSizeX3, highOffsetX, highOffsetX, dstBounds);
|
| + boxBlurX(s, sw, d, kernelSizeX, lowOffsetX, highOffsetX, w, h);
|
| + boxBlurX(d, w, t, kernelSizeX, highOffsetX, lowOffsetX, w, h);
|
| + boxBlurX(t, w, d, kernelSizeX3, highOffsetX, highOffsetX, w, h);
|
| } else if (kernelSizeY > 0) {
|
| - boxBlurY(src, dst, kernelSizeY, lowOffsetY, highOffsetY, srcBounds);
|
| - boxBlurY(*dst, &temp, kernelSizeY, highOffsetY, lowOffsetY, dstBounds);
|
| - boxBlurY(temp, dst, kernelSizeY3, highOffsetY, highOffsetY, dstBounds);
|
| + boxBlurY(s, sw, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
|
| + boxBlurY(d, w, t, kernelSizeY, highOffsetY, lowOffsetY, h, w);
|
| + boxBlurY(t, w, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
|
| }
|
| offset->fX += srcBounds.fLeft;
|
| offset->fY += srcBounds.fTop;
|
|
|