Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(887)

Unified Diff: src/effects/SkBlurImageFilter.cpp

Issue 59133006: Implement the y-transpose optimization in image (RGBA) blurs. This gives ~38% performance improvem… (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Revert to older version; remove spurious whitespace. Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/effects/SkBlurImageFilter.cpp
diff --git a/src/effects/SkBlurImageFilter.cpp b/src/effects/SkBlurImageFilter.cpp
index 0fa54b555cd78cfae0b800b699a155f503bd5cd0..c04c258829ed846faeb7c1a89508e6e13fde902d 100644
--- a/src/effects/SkBlurImageFilter.cpp
+++ b/src/effects/SkBlurImageFilter.cpp
@@ -39,28 +39,55 @@ void SkBlurImageFilter::flatten(SkFlattenableWriteBuffer& buffer) const {
buffer.writeScalar(fSigma.fHeight);
}
-static void boxBlurX(const SkBitmap& src, SkBitmap* dst, int kernelSize,
- int leftOffset, int rightOffset, const SkIRect& bounds)
+enum BlurDirection {
+ kX, kY
+};
+
+/**
+ *
+ * In order to make memory accesses cache-friendly, we reorder the passes to
+ * use contiguous memory reads wherever possible.
+ *
+ * For example, the 6 passes of the X-and-Y blur case are rewritten as
+ * follows. Instead of 3 passes in X and 3 passes in Y, we perform
+ * 2 passes in X, 1 pass in X transposed to Y on write, 2 passes in X,
+ * then 1 pass in X transposed to Y on write.
+ *
+ * +----+       +----+       +----+        +---+       +---+       +---+        +----+
+ * | AB | ----> | AB | ----> | AB | -----> | A | ----> | A | ----> | A | -----> | AB |
+ * +----+ blurX +----+ blurX +----+ blurXY | B | blurX | B | blurX | B | blurXY +----+
+ *                                         +---+       +---+       +---+
+ *
+ * In this way, two of the y-blurs become x-blurs applied to transposed
+ * images, and all memory reads are contiguous.
+ */
+
+template<BlurDirection srcDirection, BlurDirection dstDirection>
+static void boxBlur(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
+ int leftOffset, int rightOffset, int width, int height)
{
- int width = bounds.width(), height = bounds.height();
int rightBorder = SkMin32(rightOffset + 1, width);
+ int srcStrideX = srcDirection == kX ? 1 : srcStride;
+ int dstStrideX = dstDirection == kX ? 1 : height;
+ int srcStrideY = srcDirection == kX ? srcStride : 1;
+ int dstStrideY = dstDirection == kX ? width : 1;
#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
uint32_t scale = (1 << 24) / kernelSize;
uint32_t half = 1 << 23;
#endif
for (int y = 0; y < height; ++y) {
int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
- SkPMColor* p = src.getAddr32(bounds.fLeft, y + bounds.fTop);
+ const SkPMColor* p = src;
for (int i = 0; i < rightBorder; ++i) {
sumA += SkGetPackedA32(*p);
sumR += SkGetPackedR32(*p);
sumG += SkGetPackedG32(*p);
sumB += SkGetPackedB32(*p);
- p++;
+ p += srcStrideX;
}
- const SkColor* sptr = src.getAddr32(bounds.fLeft, bounds.fTop + y);
- SkColor* dptr = dst->getAddr32(0, y);
+ const SkPMColor* sptr = src;
+ SkColor* dptr = dst;
for (int x = 0; x < width; ++x) {
#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
*dptr = SkPackARGB32((sumA * scale + half) >> 24,
@@ -74,81 +101,48 @@ static void boxBlurX(const SkBitmap& src, SkBitmap* dst, int kernelSize,
sumB / kernelSize);
#endif
if (x >= leftOffset) {
- SkColor l = *(sptr - leftOffset);
+ SkColor l = *(sptr - leftOffset * srcStrideX);
sumA -= SkGetPackedA32(l);
sumR -= SkGetPackedR32(l);
sumG -= SkGetPackedG32(l);
sumB -= SkGetPackedB32(l);
}
if (x + rightOffset + 1 < width) {
- SkColor r = *(sptr + rightOffset + 1);
+ SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);
sumA += SkGetPackedA32(r);
sumR += SkGetPackedR32(r);
sumG += SkGetPackedG32(r);
sumB += SkGetPackedB32(r);
}
- sptr++;
- dptr++;
+ sptr += srcStrideX;
+ if (srcDirection == kY) {
+ SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX);
+ }
+ dptr += dstStrideX;
}
+ src += srcStrideY;
+ dst += dstStrideY;
}
}
-static void boxBlurY(const SkBitmap& src, SkBitmap* dst, int kernelSize,
- int topOffset, int bottomOffset, const SkIRect& bounds)
+static void boxBlurX(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
+ int leftOffset, int rightOffset, int width, int height)
{
- int width = bounds.width(), height = bounds.height();
- int bottomBorder = SkMin32(bottomOffset + 1, height);
- int srcStride = src.rowBytesAsPixels();
- int dstStride = dst->rowBytesAsPixels();
-#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
- uint32_t scale = (1 << 24) / kernelSize;
- uint32_t half = 1 << 23;
-#endif
- for (int x = 0; x < width; ++x) {
- int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
- SkColor* p = src.getAddr32(bounds.fLeft + x, bounds.fTop);
- for (int i = 0; i < bottomBorder; ++i) {
- sumA += SkGetPackedA32(*p);
- sumR += SkGetPackedR32(*p);
- sumG += SkGetPackedG32(*p);
- sumB += SkGetPackedB32(*p);
- p += srcStride;
- }
+ boxBlur<kX, kX>(src, srcStride, dst, kernelSize, leftOffset, rightOffset, width, height);
+}
- const SkColor* sptr = src.getAddr32(bounds.fLeft + x, bounds.fTop);
- SkColor* dptr = dst->getAddr32(x, 0);
- for (int y = 0; y < height; ++y) {
#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
- *dptr = SkPackARGB32((sumA * scale + half) >> 24,
- (sumR * scale + half) >> 24,
- (sumG * scale + half) >> 24,
- (sumB * scale + half) >> 24);
-#else
- *dptr = SkPackARGB32(sumA / kernelSize,
- sumR / kernelSize,
- sumG / kernelSize,
- sumB / kernelSize);
+static void boxBlurXY(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
+ int leftOffset, int rightOffset, int width, int height)
+{
+ boxBlur<kX, kY>(src, srcStride, dst, kernelSize, leftOffset, rightOffset, width, height);
+}
#endif
- if (y >= topOffset) {
- SkColor l = *(sptr - topOffset * srcStride);
- sumA -= SkGetPackedA32(l);
- sumR -= SkGetPackedR32(l);
- sumG -= SkGetPackedG32(l);
- sumB -= SkGetPackedB32(l);
- }
- if (y + bottomOffset + 1 < height) {
- SkColor r = *(sptr + (bottomOffset + 1) * srcStride);
- sumA += SkGetPackedA32(r);
- sumR += SkGetPackedR32(r);
- sumG += SkGetPackedG32(r);
- sumB += SkGetPackedB32(r);
- }
- sptr += srcStride;
- // The next leading pixel seems to be too hard to predict. Hint the fetch.
- SK_PREFETCH(sptr + (bottomOffset + 1) * srcStride);
- dptr += dstStride;
- }
- }
+
+static void boxBlurY(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
+ int topOffset, int bottomOffset, int width, int height)
+{
+ boxBlur<kY, kY>(src, srcStride, dst, kernelSize, topOffset, bottomOffset, width, height);
}
static void getBox3Params(SkScalar s, int *kernelSize, int* kernelSize3, int *lowOffset,
@@ -213,21 +207,35 @@ bool SkBlurImageFilter::onFilterImage(Proxy* proxy,
return false;
}
+ const SkPMColor* s = src.getAddr32(srcBounds.left(), srcBounds.top());
+ SkPMColor* t = temp.getAddr32(0, 0);
+ SkPMColor* d = dst->getAddr32(0, 0);
+ int w = dstBounds.width(), h = dstBounds.height();
+ int sw = src.rowBytesAsPixels();
if (kernelSizeX > 0 && kernelSizeY > 0) {
- boxBlurX(src, &temp, kernelSizeX, lowOffsetX, highOffsetX, srcBounds);
- boxBlurY(temp, dst, kernelSizeY, lowOffsetY, highOffsetY, dstBounds);
- boxBlurX(*dst, &temp, kernelSizeX, highOffsetX, lowOffsetX, dstBounds);
- boxBlurY(temp, dst, kernelSizeY, highOffsetY, lowOffsetY, dstBounds);
- boxBlurX(*dst, &temp, kernelSizeX3, highOffsetX, highOffsetX, dstBounds);
- boxBlurY(temp, dst, kernelSizeY3, highOffsetY, highOffsetY, dstBounds);
+#ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
+ boxBlurX(s, sw, t, kernelSizeX, lowOffsetX, highOffsetX, w, h);
+ boxBlurX(t, w, d, kernelSizeX, highOffsetX, lowOffsetX, w, h);
+ boxBlurXY(d, w, t, kernelSizeX3, highOffsetX, highOffsetX, w, h);
+ boxBlurX(t, h, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
+ boxBlurX(d, h, t, kernelSizeY, highOffsetY, lowOffsetY, h, w);
+ boxBlurXY(t, h, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
+#else
+ boxBlurX(s, sw, t, kernelSizeX, lowOffsetX, highOffsetX, w, h);
+ boxBlurY(t, w, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
+ boxBlurX(d, w, t, kernelSizeX, highOffsetX, lowOffsetX, w, h);
+ boxBlurY(t, w, d, kernelSizeY, highOffsetY, lowOffsetY, h, w);
+ boxBlurX(d, w, t, kernelSizeX3, highOffsetX, highOffsetX, w, h);
+ boxBlurY(t, w, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
+#endif
} else if (kernelSizeX > 0) {
- boxBlurX(src, dst, kernelSizeX, lowOffsetX, highOffsetX, srcBounds);
- boxBlurX(*dst, &temp, kernelSizeX, highOffsetX, lowOffsetX, dstBounds);
- boxBlurX(temp, dst, kernelSizeX3, highOffsetX, highOffsetX, dstBounds);
+ boxBlurX(s, sw, d, kernelSizeX, lowOffsetX, highOffsetX, w, h);
+ boxBlurX(d, w, t, kernelSizeX, highOffsetX, lowOffsetX, w, h);
+ boxBlurX(t, w, d, kernelSizeX3, highOffsetX, highOffsetX, w, h);
} else if (kernelSizeY > 0) {
- boxBlurY(src, dst, kernelSizeY, lowOffsetY, highOffsetY, srcBounds);
- boxBlurY(*dst, &temp, kernelSizeY, highOffsetY, lowOffsetY, dstBounds);
- boxBlurY(temp, dst, kernelSizeY3, highOffsetY, highOffsetY, dstBounds);
+ boxBlurY(s, sw, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
+ boxBlurY(d, w, t, kernelSizeY, highOffsetY, lowOffsetY, h, w);
+ boxBlurY(t, w, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
}
offset->fX += srcBounds.fLeft;
offset->fY += srcBounds.fTop;
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698