Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(25)

Side by Side Diff: src/effects/SkBlurImageFilter.cpp

Issue 59133006: Implement the y-transpose optimization in image (RGBA) blurs. This gives ~38% performance improvem… (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Revert to older version; remove spurious whitespace. Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The Android Open Source Project 2 * Copyright 2011 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBitmap.h" 8 #include "SkBitmap.h"
9 #include "SkBlurImageFilter.h" 9 #include "SkBlurImageFilter.h"
10 #include "SkColorPriv.h" 10 #include "SkColorPriv.h"
(...skipping 21 matching lines...) Expand all
32 : INHERITED(input, cropRect), fSigma(SkSize::Make(sigmaX, sigmaY)) { 32 : INHERITED(input, cropRect), fSigma(SkSize::Make(sigmaX, sigmaY)) {
33 SkASSERT(sigmaX >= 0 && sigmaY >= 0); 33 SkASSERT(sigmaX >= 0 && sigmaY >= 0);
34 } 34 }
35 35
36 void SkBlurImageFilter::flatten(SkFlattenableWriteBuffer& buffer) const { 36 void SkBlurImageFilter::flatten(SkFlattenableWriteBuffer& buffer) const {
37 this->INHERITED::flatten(buffer); 37 this->INHERITED::flatten(buffer);
38 buffer.writeScalar(fSigma.fWidth); 38 buffer.writeScalar(fSigma.fWidth);
39 buffer.writeScalar(fSigma.fHeight); 39 buffer.writeScalar(fSigma.fHeight);
40 } 40 }
41 41
42 static void boxBlurX(const SkBitmap& src, SkBitmap* dst, int kernelSize, 42 enum BlurDirection {
43 int leftOffset, int rightOffset, const SkIRect& bounds) 43 kX, kY
44 };
45
46 /**
47 *
48 * In order to make memory accesses cache-friendly, we reorder the passes to
49 * use contiguous memory reads wherever possible.
50 *
51 * For example, the 6 passes of the X-and-Y blur case are rewritten as
52 * follows. Instead of 3 passes in X and 3 passes in Y, we perform
53 * 2 passes in X, 1 pass in X transposed to Y on write, 2 passes in X,
54 * then 1 pass in X transposed to Y on write.
55 *
56 * +----+       +----+       +----+        +---+       +---+       +---+        +----+
57 * | AB | ----> | AB | ----> | AB | -----> | A | ----> | A | ----> | A | -----> | AB |
58 * +----+ blurX +----+ blurX +----+ blurXY | B | blurX | B | blurX | B | blurXY +----+
59 *                                         +---+       +---+       +---+
60 *
61 * In this way, two of the y-blurs become x-blurs applied to transposed
62 * images, and all memory reads are contiguous.
63 */
64
65 template<BlurDirection srcDirection, BlurDirection dstDirection>
66 static void boxBlur(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
67 int leftOffset, int rightOffset, int width, int height)
44 { 68 {
45 int width = bounds.width(), height = bounds.height();
46 int rightBorder = SkMin32(rightOffset + 1, width); 69 int rightBorder = SkMin32(rightOffset + 1, width);
70 int srcStrideX = srcDirection == kX ? 1 : srcStride;
71 int dstStrideX = dstDirection == kX ? 1 : height;
72 int srcStrideY = srcDirection == kX ? srcStride : 1;
73 int dstStrideY = dstDirection == kX ? width : 1;
47 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION 74 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
48 uint32_t scale = (1 << 24) / kernelSize; 75 uint32_t scale = (1 << 24) / kernelSize;
49 uint32_t half = 1 << 23; 76 uint32_t half = 1 << 23;
50 #endif 77 #endif
51 for (int y = 0; y < height; ++y) { 78 for (int y = 0; y < height; ++y) {
52 int sumA = 0, sumR = 0, sumG = 0, sumB = 0; 79 int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
53 SkPMColor* p = src.getAddr32(bounds.fLeft, y + bounds.fTop); 80 const SkPMColor* p = src;
54 for (int i = 0; i < rightBorder; ++i) { 81 for (int i = 0; i < rightBorder; ++i) {
55 sumA += SkGetPackedA32(*p); 82 sumA += SkGetPackedA32(*p);
56 sumR += SkGetPackedR32(*p); 83 sumR += SkGetPackedR32(*p);
57 sumG += SkGetPackedG32(*p); 84 sumG += SkGetPackedG32(*p);
58 sumB += SkGetPackedB32(*p); 85 sumB += SkGetPackedB32(*p);
59 p++; 86 p += srcStrideX;
60 } 87 }
61 88
62 const SkColor* sptr = src.getAddr32(bounds.fLeft, bounds.fTop + y); 89 const SkPMColor* sptr = src;
63 SkColor* dptr = dst->getAddr32(0, y); 90 SkColor* dptr = dst;
64 for (int x = 0; x < width; ++x) { 91 for (int x = 0; x < width; ++x) {
65 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION 92 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
66 *dptr = SkPackARGB32((sumA * scale + half) >> 24, 93 *dptr = SkPackARGB32((sumA * scale + half) >> 24,
67 (sumR * scale + half) >> 24, 94 (sumR * scale + half) >> 24,
68 (sumG * scale + half) >> 24, 95 (sumG * scale + half) >> 24,
69 (sumB * scale + half) >> 24); 96 (sumB * scale + half) >> 24);
70 #else 97 #else
71 *dptr = SkPackARGB32(sumA / kernelSize, 98 *dptr = SkPackARGB32(sumA / kernelSize,
72 sumR / kernelSize, 99 sumR / kernelSize,
73 sumG / kernelSize, 100 sumG / kernelSize,
74 sumB / kernelSize); 101 sumB / kernelSize);
75 #endif 102 #endif
76 if (x >= leftOffset) { 103 if (x >= leftOffset) {
77 SkColor l = *(sptr - leftOffset); 104 SkColor l = *(sptr - leftOffset * srcStrideX);
78 sumA -= SkGetPackedA32(l); 105 sumA -= SkGetPackedA32(l);
79 sumR -= SkGetPackedR32(l); 106 sumR -= SkGetPackedR32(l);
80 sumG -= SkGetPackedG32(l); 107 sumG -= SkGetPackedG32(l);
81 sumB -= SkGetPackedB32(l); 108 sumB -= SkGetPackedB32(l);
82 } 109 }
83 if (x + rightOffset + 1 < width) { 110 if (x + rightOffset + 1 < width) {
84 SkColor r = *(sptr + rightOffset + 1); 111 SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);
85 sumA += SkGetPackedA32(r); 112 sumA += SkGetPackedA32(r);
86 sumR += SkGetPackedR32(r); 113 sumR += SkGetPackedR32(r);
87 sumG += SkGetPackedG32(r); 114 sumG += SkGetPackedG32(r);
88 sumB += SkGetPackedB32(r); 115 sumB += SkGetPackedB32(r);
89 } 116 }
90 sptr++; 117 sptr += srcStrideX;
91 dptr++; 118 if (srcDirection == kY) {
119 SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX);
120 }
121 dptr += dstStrideX;
92 } 122 }
123 src += srcStrideY;
124 dst += dstStrideY;
93 } 125 }
94 } 126 }
95 127
96 static void boxBlurY(const SkBitmap& src, SkBitmap* dst, int kernelSize, 128 static void boxBlurX(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
97 int topOffset, int bottomOffset, const SkIRect& bounds) 129 int leftOffset, int rightOffset, int width, int height)
98 { 130 {
99 int width = bounds.width(), height = bounds.height(); 131 boxBlur<kX, kX>(src, srcStride, dst, kernelSize, leftOffset, rightOffset, width, height);
100 int bottomBorder = SkMin32(bottomOffset + 1, height); 132 }
101 int srcStride = src.rowBytesAsPixels(); 133
102 int dstStride = dst->rowBytesAsPixels();
103 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION 134 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
104 uint32_t scale = (1 << 24) / kernelSize; 135 static void boxBlurXY(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
105 uint32_t half = 1 << 23; 136 int leftOffset, int rightOffset, int width, int height)
137 {
138 boxBlur<kX, kY>(src, srcStride, dst, kernelSize, leftOffset, rightOffset, width, height);
139 }
106 #endif 140 #endif
107 for (int x = 0; x < width; ++x) {
108 int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
109 SkColor* p = src.getAddr32(bounds.fLeft + x, bounds.fTop);
110 for (int i = 0; i < bottomBorder; ++i) {
111 sumA += SkGetPackedA32(*p);
112 sumR += SkGetPackedR32(*p);
113 sumG += SkGetPackedG32(*p);
114 sumB += SkGetPackedB32(*p);
115 p += srcStride;
116 }
117 141
118 const SkColor* sptr = src.getAddr32(bounds.fLeft + x, bounds.fTop); 142 static void boxBlurY(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
119 SkColor* dptr = dst->getAddr32(x, 0); 143 int topOffset, int bottomOffset, int width, int height)
120 for (int y = 0; y < height; ++y) { 144 {
121 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION 145 boxBlur<kY, kY>(src, srcStride, dst, kernelSize, topOffset, bottomOffset, width, height);
122 *dptr = SkPackARGB32((sumA * scale + half) >> 24,
123 (sumR * scale + half) >> 24,
124 (sumG * scale + half) >> 24,
125 (sumB * scale + half) >> 24);
126 #else
127 *dptr = SkPackARGB32(sumA / kernelSize,
128 sumR / kernelSize,
129 sumG / kernelSize,
130 sumB / kernelSize);
131 #endif
132 if (y >= topOffset) {
133 SkColor l = *(sptr - topOffset * srcStride);
134 sumA -= SkGetPackedA32(l);
135 sumR -= SkGetPackedR32(l);
136 sumG -= SkGetPackedG32(l);
137 sumB -= SkGetPackedB32(l);
138 }
139 if (y + bottomOffset + 1 < height) {
140 SkColor r = *(sptr + (bottomOffset + 1) * srcStride);
141 sumA += SkGetPackedA32(r);
142 sumR += SkGetPackedR32(r);
143 sumG += SkGetPackedG32(r);
144 sumB += SkGetPackedB32(r);
145 }
146 sptr += srcStride;
147 // The next leading pixel seems to be too hard to predict. Hint the fetch.
148 SK_PREFETCH(sptr + (bottomOffset + 1) * srcStride);
149 dptr += dstStride;
150 }
151 }
152 } 146 }
153 147
154 static void getBox3Params(SkScalar s, int *kernelSize, int* kernelSize3, int *lowOffset, 148 static void getBox3Params(SkScalar s, int *kernelSize, int* kernelSize3, int *lowOffset,
155 int *highOffset) 149 int *highOffset)
156 { 150 {
157 float pi = SkScalarToFloat(SK_ScalarPI); 151 float pi = SkScalarToFloat(SK_ScalarPI);
158 int d = static_cast<int>(floorf(SkScalarToFloat(s) * 3.0f * sqrtf(2.0f * pi) / 4.0f + 0.5f)); 152 int d = static_cast<int>(floorf(SkScalarToFloat(s) * 3.0f * sqrtf(2.0f * pi) / 4.0f + 0.5f));
159 *kernelSize = d; 153 *kernelSize = d;
160 if (d % 2 == 1) { 154 if (d % 2 == 1) {
161 *lowOffset = *highOffset = (d - 1) / 2; 155 *lowOffset = *highOffset = (d - 1) / 2;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
206 src.copyTo(dst, dst->config()); 200 src.copyTo(dst, dst->config());
207 return true; 201 return true;
208 } 202 }
209 203
210 SkBitmap temp; 204 SkBitmap temp;
211 temp.setConfig(dst->config(), dst->width(), dst->height()); 205 temp.setConfig(dst->config(), dst->width(), dst->height());
212 if (!temp.allocPixels()) { 206 if (!temp.allocPixels()) {
213 return false; 207 return false;
214 } 208 }
215 209
210 const SkPMColor* s = src.getAddr32(srcBounds.left(), srcBounds.top());
211 SkPMColor* t = temp.getAddr32(0, 0);
212 SkPMColor* d = dst->getAddr32(0, 0);
213 int w = dstBounds.width(), h = dstBounds.height();
214 int sw = src.rowBytesAsPixels();
216 if (kernelSizeX > 0 && kernelSizeY > 0) { 215 if (kernelSizeX > 0 && kernelSizeY > 0) {
217 boxBlurX(src, &temp, kernelSizeX, lowOffsetX, highOffsetX, srcBounds); 216 #ifndef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
218 boxBlurY(temp, dst, kernelSizeY, lowOffsetY, highOffsetY, dstBounds); 217 boxBlurX(s, sw, t, kernelSizeX, lowOffsetX, highOffsetX, w, h);
219 boxBlurX(*dst, &temp, kernelSizeX, highOffsetX, lowOffsetX, dstBounds); 218 boxBlurX(t, w, d, kernelSizeX, highOffsetX, lowOffsetX, w, h);
220 boxBlurY(temp, dst, kernelSizeY, highOffsetY, lowOffsetY, dstBounds); 219 boxBlurXY(d, w, t, kernelSizeX3, highOffsetX, highOffsetX, w, h);
221 boxBlurX(*dst, &temp, kernelSizeX3, highOffsetX, highOffsetX, dstBounds); 220 boxBlurX(t, h, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
222 boxBlurY(temp, dst, kernelSizeY3, highOffsetY, highOffsetY, dstBounds); 221 boxBlurX(d, h, t, kernelSizeY, highOffsetY, lowOffsetY, h, w);
222 boxBlurXY(t, h, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
223 #else
224 boxBlurX(s, sw, t, kernelSizeX, lowOffsetX, highOffsetX, w, h);
225 boxBlurY(t, w, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
226 boxBlurX(d, w, t, kernelSizeX, highOffsetX, lowOffsetX, w, h);
227 boxBlurY(t, w, d, kernelSizeY, highOffsetY, lowOffsetY, h, w);
228 boxBlurX(d, w, t, kernelSizeX3, highOffsetX, highOffsetX, w, h);
229 boxBlurY(t, w, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
230 #endif
223 } else if (kernelSizeX > 0) { 231 } else if (kernelSizeX > 0) {
224 boxBlurX(src, dst, kernelSizeX, lowOffsetX, highOffsetX, srcBounds); 232 boxBlurX(s, sw, d, kernelSizeX, lowOffsetX, highOffsetX, w, h);
225 boxBlurX(*dst, &temp, kernelSizeX, highOffsetX, lowOffsetX, dstBounds); 233 boxBlurX(d, w, t, kernelSizeX, highOffsetX, lowOffsetX, w, h);
226 boxBlurX(temp, dst, kernelSizeX3, highOffsetX, highOffsetX, dstBounds); 234 boxBlurX(t, w, d, kernelSizeX3, highOffsetX, highOffsetX, w, h);
227 } else if (kernelSizeY > 0) { 235 } else if (kernelSizeY > 0) {
228 boxBlurY(src, dst, kernelSizeY, lowOffsetY, highOffsetY, srcBounds); 236 boxBlurY(s, sw, d, kernelSizeY, lowOffsetY, highOffsetY, h, w);
229 boxBlurY(*dst, &temp, kernelSizeY, highOffsetY, lowOffsetY, dstBounds); 237 boxBlurY(d, w, t, kernelSizeY, highOffsetY, lowOffsetY, h, w);
230 boxBlurY(temp, dst, kernelSizeY3, highOffsetY, highOffsetY, dstBounds); 238 boxBlurY(t, w, d, kernelSizeY3, highOffsetY, highOffsetY, h, w);
231 } 239 }
232 offset->fX += srcBounds.fLeft; 240 offset->fX += srcBounds.fLeft;
233 offset->fY += srcBounds.fTop; 241 offset->fY += srcBounds.fTop;
234 return true; 242 return true;
235 } 243 }
236 244
237 bool SkBlurImageFilter::filterImageGPU(Proxy* proxy, const SkBitmap& src, const SkMatrix& ctm, 245 bool SkBlurImageFilter::filterImageGPU(Proxy* proxy, const SkBitmap& src, const SkMatrix& ctm,
238 SkBitmap* result, SkIPoint* offset) { 246 SkBitmap* result, SkIPoint* offset) {
239 #if SK_SUPPORT_GPU 247 #if SK_SUPPORT_GPU
240 SkBitmap input; 248 SkBitmap input;
(...skipping 14 matching lines...) Expand all
255 fSigma.width(), 263 fSigma.width(),
256 fSigma.height())); 264 fSigma.height()));
257 offset->fX += rect.fLeft; 265 offset->fX += rect.fLeft;
258 offset->fY += rect.fTop; 266 offset->fY += rect.fTop;
259 return SkImageFilterUtils::WrapTexture(tex, rect.width(), rect.height(), result); 267 return SkImageFilterUtils::WrapTexture(tex, rect.width(), rect.height(), result);
260 #else 268 #else
261 SkDEBUGFAIL("Should not call in GPU-less build"); 269 SkDEBUGFAIL("Should not call in GPU-less build");
262 return false; 270 return false;
263 #endif 271 #endif
264 } 272 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698