OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkBlurImageFilter_opts_DEFINED | 8 #ifndef SkBlurImageFilter_opts_DEFINED |
9 #define SkBlurImageFilter_opts_DEFINED | 9 #define SkBlurImageFilter_opts_DEFINED |
10 | 10 |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
77 vst1_lane_u32(dptr + width, px2, 1); \ | 77 vst1_lane_u32(dptr + width, px2, 1); \ |
78 } else { \ | 78 } else { \ |
79 vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ | 79 vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ |
80 } | 80 } |
81 | 81 |
82 #define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) | 82 #define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) |
83 #define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) | 83 #define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) |
84 | 84 |
85 // Fast path for kernel sizes between 2 and 127, working on two rows at a time. | 85 // Fast path for kernel sizes between 2 and 127, working on two rows at a time. |
86 template<BlurDirection srcDirection, BlurDirection dstDirection> | 86 template<BlurDirection srcDirection, BlurDirection dstDirection> |
87 int box_blur_double(const SkPMColor** src, int srcStride, const SkIRect& srcBoun
ds, SkPMColor** dst, int kernelSize, | 87 void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int
kernelSize, |
88 int leftOffset, int rightOffset, int width, int height) { | 88 int leftOffset, int rightOffset, int width, int* height) { |
89 // Load 2 pixels from adjacent rows. | 89 // Load 2 pixels from adjacent rows. |
90 auto load_2_pixels = [&](const SkPMColor* s) { | 90 auto load_2_pixels = [&](const SkPMColor* s) { |
91 if (srcDirection == BlurDirection::kX) { | 91 if (srcDirection == BlurDirection::kX) { |
92 // 10% faster by adding these 2 prefetches | 92 // 10% faster by adding these 2 prefetches |
93 SK_PREFETCH(s + 16); | 93 SK_PREFETCH(s + 16); |
94 SK_PREFETCH(s + 16 + srcStride); | 94 SK_PREFETCH(s + 16 + srcStride); |
95 auto one = vld1_lane_u32(s + 0, vdup_n_u32(0), 0), | 95 auto one = vld1_lane_u32(s + 0, vdup_n_u32(0), 0), |
96 two = vld1_lane_u32(s + srcStride, one, 1); | 96 two = vld1_lane_u32(s + srcStride, one, 1); |
97 return vreinterpret_u8_u32(two); | 97 return vreinterpret_u8_u32(two); |
98 } else { | 98 } else { |
99 return vld1_u8((uint8_t*)s); | 99 return vld1_u8((uint8_t*)s); |
100 } | 100 } |
101 }; | 101 }; |
102 int left = srcBounds.left(); | 102 int incrementStart = SkMax32(-rightOffset - 1, -width); |
103 int right = srcBounds.right(); | 103 int incrementEnd = SkMax32(width - rightOffset - 1, 0); |
104 int top = srcBounds.top(); | 104 int decrementStart = SkMin32(leftOffset, width); |
105 int bottom = srcBounds.bottom(); | |
106 int incrementStart = SkMax32(left - rightOffset - 1, -width); | |
107 int incrementEnd = SkMax32(right - rightOffset - 1, 0); | |
108 int decrementStart = SkMin32(left + leftOffset, width); | |
109 int decrementEnd = SkMin32(right + leftOffset, width); | |
110 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; | 105 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
111 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; | 106 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : *height; |
112 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; | 107 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
113 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; | 108 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; |
114 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); | 109 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); |
115 | 110 |
116 for (; bottom - top >= 2; top += 2) { | 111 for (; *height >= 2; *height -= 2) { |
117 uint16x8_t sum = vdupq_n_u16(0); | 112 uint16x8_t sum = vdupq_n_u16(0); |
118 const SkPMColor* lptr = *src; | 113 const SkPMColor* lptr = *src; |
119 const SkPMColor* rptr = *src; | 114 const SkPMColor* rptr = *src; |
120 SkPMColor* dptr = *dst; | 115 SkPMColor* dptr = *dst; |
121 int x; | 116 int x; |
122 for (x = incrementStart; x < 0; ++x) { | 117 for (x = incrementStart; x < 0; ++x) { |
123 INCREMENT_SUMS_DOUBLE(rptr); | 118 INCREMENT_SUMS_DOUBLE(rptr); |
124 rptr += srcStrideX; | 119 rptr += srcStrideX; |
125 } | 120 } |
126 // Clear to zero when sampling to the left our domain. "sum" is zero her
e because we | |
127 // initialized it above, and the preceeding loop has no effect in this c
ase. | |
128 for (x = 0; x < incrementStart; ++x) { | |
129 STORE_SUMS_DOUBLE | |
130 dptr += dstStrideX; | |
131 } | |
132 for (; x < decrementStart && x < incrementEnd; ++x) { | 121 for (; x < decrementStart && x < incrementEnd; ++x) { |
133 STORE_SUMS_DOUBLE | 122 STORE_SUMS_DOUBLE |
134 dptr += dstStrideX; | 123 dptr += dstStrideX; |
135 INCREMENT_SUMS_DOUBLE(rptr); | 124 INCREMENT_SUMS_DOUBLE(rptr); |
136 rptr += srcStrideX; | 125 rptr += srcStrideX; |
137 } | 126 } |
138 for (x = decrementStart; x < incrementEnd; ++x) { | 127 for (x = decrementStart; x < incrementEnd; ++x) { |
139 STORE_SUMS_DOUBLE | 128 STORE_SUMS_DOUBLE |
140 dptr += dstStrideX; | 129 dptr += dstStrideX; |
141 INCREMENT_SUMS_DOUBLE(rptr); | 130 INCREMENT_SUMS_DOUBLE(rptr); |
142 rptr += srcStrideX; | 131 rptr += srcStrideX; |
143 DECREMENT_SUMS_DOUBLE(lptr); | 132 DECREMENT_SUMS_DOUBLE(lptr); |
144 lptr += srcStrideX; | 133 lptr += srcStrideX; |
145 } | 134 } |
146 for (x = incrementEnd; x < decrementStart; ++x) { | 135 for (x = incrementEnd; x < decrementStart; ++x) { |
147 STORE_SUMS_DOUBLE | 136 STORE_SUMS_DOUBLE |
148 dptr += dstStrideX; | 137 dptr += dstStrideX; |
149 } | 138 } |
150 for (; x < decrementEnd; ++x) { | 139 for (; x < width; ++x) { |
151 STORE_SUMS_DOUBLE | 140 STORE_SUMS_DOUBLE |
152 dptr += dstStrideX; | 141 dptr += dstStrideX; |
153 DECREMENT_SUMS_DOUBLE(lptr); | 142 DECREMENT_SUMS_DOUBLE(lptr); |
154 lptr += srcStrideX; | 143 lptr += srcStrideX; |
155 } | 144 } |
156 // Clear to zero when sampling to the right of our domain. "sum" is zero
here because we | |
157 // added on then subtracted off all of the pixels, leaving zero. | |
158 for (; x < width; ++x) { | |
159 STORE_SUMS_DOUBLE | |
160 dptr += dstStrideX; | |
161 } | |
162 *src += srcStrideY * 2; | 145 *src += srcStrideY * 2; |
163 *dst += dstStrideY * 2; | 146 *dst += dstStrideY * 2; |
164 } | 147 } |
165 return top; | |
166 } | 148 } |
167 | 149 |
168 // ARGB -> 0A0R 0G0B | 150 // ARGB -> 0A0R 0G0B |
169 static inline uint16x4_t expand(SkPMColor p) { | 151 static inline uint16x4_t expand(SkPMColor p) { |
170 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p)))); | 152 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p)))); |
171 }; | 153 }; |
172 | 154 |
173 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); | 155 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); |
174 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23); | 156 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23); |
175 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0); | 157 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0); |
176 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c)); | 158 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c)); |
177 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c)); | 159 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c)); |
178 | 160 |
179 #define STORE_SUMS \ | 161 #define STORE_SUMS \ |
180 uint32x4_t result = vmlaq_u32(half, sum, scale); \ | 162 uint32x4_t result = vmlaq_u32(half, sum, scale); \ |
181 uint16x4_t result16 = vqshrn_n_u32(result, 16); \ | 163 uint16x4_t result16 = vqshrn_n_u32(result, 16); \ |
182 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \ | 164 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \ |
183 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); | 165 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); |
184 | 166 |
185 #define DOUBLE_ROW_OPTIMIZATION \ | 167 #define DOUBLE_ROW_OPTIMIZATION \ |
186 if (1 < kernelSize && kernelSize < 128) { \ | 168 if (1 < kernelSize && kernelSize < 128) { \ |
187 top = box_blur_double<srcDirection, dstDirection>(&src, srcStride, srcBo
unds, &dst, \ | 169 box_blur_double<srcDirection, dstDirection>(&src, srcStride, &dst, kerne
lSize, \ |
188 kernelSize, leftOffset
, rightOffset, \ | 170 leftOffset, rightOffset, wid
th, &height); \ |
189 width, height); \ | |
190 } | 171 } |
191 | 172 |
192 #else // Neither NEON nor >=SSE2. | 173 #else // Neither NEON nor >=SSE2. |
193 | 174 |
194 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize; | 175 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize; |
195 #define INIT_HALF uint32_t half = 1 << 23; | 176 #define INIT_HALF uint32_t half = 1 << 23; |
196 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0; | 177 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0; |
197 #define INCREMENT_SUMS(c) \ | 178 #define INCREMENT_SUMS(c) \ |
198 sumA += SkGetPackedA32(c); \ | 179 sumA += SkGetPackedA32(c); \ |
199 sumR += SkGetPackedR32(c); \ | 180 sumR += SkGetPackedR32(c); \ |
(...skipping 12 matching lines...) Expand all Loading... |
212 #define DOUBLE_ROW_OPTIMIZATION | 193 #define DOUBLE_ROW_OPTIMIZATION |
213 | 194 |
214 #endif | 195 #endif |
215 | 196 |
216 #define PREFETCH_RPTR \ | 197 #define PREFETCH_RPTR \ |
217 if (srcDirection == BlurDirection::kY) { \ | 198 if (srcDirection == BlurDirection::kY) { \ |
218 SK_PREFETCH(rptr); \ | 199 SK_PREFETCH(rptr); \ |
219 } | 200 } |
220 | 201 |
221 template<BlurDirection srcDirection, BlurDirection dstDirection> | 202 template<BlurDirection srcDirection, BlurDirection dstDirection> |
222 static void box_blur(const SkPMColor* src, int srcStride, const SkIRect& srcBoun
ds, SkPMColor* dst, | 203 static void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int ke
rnelSize, |
223 int kernelSize, int leftOffset, int rightOffset, int width,
int height) { | 204 int leftOffset, int rightOffset, int width, int height) { |
224 int left = srcBounds.left(); | 205 int incrementStart = SkMax32(-rightOffset - 1, -width); |
225 int right = srcBounds.right(); | 206 int incrementEnd = SkMax32(width - rightOffset - 1, 0); |
226 int top = srcBounds.top(); | 207 int decrementStart = SkMin32(leftOffset, width); |
227 int bottom = srcBounds.bottom(); | |
228 int incrementStart = SkMax32(left - rightOffset - 1, -width); | |
229 int incrementEnd = SkMax32(right - rightOffset - 1, 0); | |
230 int decrementStart = SkMin32(left + leftOffset, width); | |
231 int decrementEnd = SkMin32(right + leftOffset, width); | |
232 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; | 208 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
233 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; | 209 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; |
234 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; | 210 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
235 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; | 211 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; |
236 INIT_SCALE | 212 INIT_SCALE |
237 INIT_HALF | 213 INIT_HALF |
238 | 214 |
239 // Clear to zero when sampling above our domain. | |
240 for (int y = 0; y < top; y++) { | |
241 SkColor* dptr = dst; | |
242 for (int x = 0; x < width; ++x) { | |
243 *dptr = 0; | |
244 dptr += dstStrideX; | |
245 } | |
246 dst += dstStrideY; | |
247 } | |
248 | |
249 DOUBLE_ROW_OPTIMIZATION | 215 DOUBLE_ROW_OPTIMIZATION |
250 | 216 |
251 for (int y = top; y < bottom; ++y) { | 217 for (int y = 0; y < height; ++y) { |
252 INIT_SUMS | 218 INIT_SUMS |
253 const SkPMColor* lptr = src; | 219 const SkPMColor* lptr = src; |
254 const SkPMColor* rptr = src; | 220 const SkPMColor* rptr = src; |
255 SkColor* dptr = dst; | 221 SkColor* dptr = dst; |
256 int x; | 222 int x; |
257 for (x = incrementStart; x < 0; ++x) { | 223 for (x = incrementStart; x < 0; ++x) { |
258 INCREMENT_SUMS(*rptr); | 224 INCREMENT_SUMS(*rptr); |
259 rptr += srcStrideX; | 225 rptr += srcStrideX; |
260 PREFETCH_RPTR | 226 PREFETCH_RPTR |
261 } | 227 } |
262 // Clear to zero when sampling to the left of our domain. | |
263 for (x = 0; x < incrementStart; ++x) { | |
264 *dptr = 0; | |
265 dptr += dstStrideX; | |
266 } | |
267 for (; x < decrementStart && x < incrementEnd; ++x) { | 228 for (; x < decrementStart && x < incrementEnd; ++x) { |
268 STORE_SUMS | 229 STORE_SUMS |
269 dptr += dstStrideX; | 230 dptr += dstStrideX; |
270 INCREMENT_SUMS(*rptr); | 231 INCREMENT_SUMS(*rptr); |
271 rptr += srcStrideX; | 232 rptr += srcStrideX; |
272 PREFETCH_RPTR | 233 PREFETCH_RPTR |
273 } | 234 } |
274 for (x = decrementStart; x < incrementEnd; ++x) { | 235 for (x = decrementStart; x < incrementEnd; ++x) { |
275 STORE_SUMS | 236 STORE_SUMS |
276 dptr += dstStrideX; | 237 dptr += dstStrideX; |
277 INCREMENT_SUMS(*rptr); | 238 INCREMENT_SUMS(*rptr); |
278 rptr += srcStrideX; | 239 rptr += srcStrideX; |
279 PREFETCH_RPTR | 240 PREFETCH_RPTR |
280 DECREMENT_SUMS(*lptr); | 241 DECREMENT_SUMS(*lptr); |
281 lptr += srcStrideX; | 242 lptr += srcStrideX; |
282 } | 243 } |
283 for (x = incrementEnd; x < decrementStart; ++x) { | 244 for (x = incrementEnd; x < decrementStart; ++x) { |
284 STORE_SUMS | 245 STORE_SUMS |
285 dptr += dstStrideX; | 246 dptr += dstStrideX; |
286 } | 247 } |
287 for (; x < decrementEnd; ++x) { | 248 for (; x < width; ++x) { |
288 STORE_SUMS | 249 STORE_SUMS |
289 dptr += dstStrideX; | 250 dptr += dstStrideX; |
290 DECREMENT_SUMS(*lptr); | 251 DECREMENT_SUMS(*lptr); |
291 lptr += srcStrideX; | 252 lptr += srcStrideX; |
292 } | 253 } |
293 // Clear to zero when sampling to the right of our domain. | |
294 for (; x < width; ++x) { | |
295 *dptr = 0; | |
296 dptr += dstStrideX; | |
297 } | |
298 src += srcStrideY; | 254 src += srcStrideY; |
299 dst += dstStrideY; | 255 dst += dstStrideY; |
300 } | 256 } |
301 // Clear to zero when sampling below our domain. | |
302 for (int y = bottom; y < height; ++y) { | |
303 SkColor* dptr = dst; | |
304 for (int x = 0; x < width; ++x) { | |
305 *dptr = 0; | |
306 dptr += dstStrideX; | |
307 } | |
308 dst += dstStrideY; | |
309 } | |
310 } | 257 } |
311 | 258 |
312 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>, | 259 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>, |
313 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>, | 260 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>, |
314 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>; | 261 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>; |
315 | 262 |
316 } // namespace SK_OPTS_NS | 263 } // namespace SK_OPTS_NS |
317 | 264 |
318 #endif | 265 #endif |
OLD | NEW |