OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkBlurImageFilter_opts_DEFINED | 8 #ifndef SkBlurImageFilter_opts_DEFINED |
9 #define SkBlurImageFilter_opts_DEFINED | 9 #define SkBlurImageFilter_opts_DEFINED |
10 | 10 |
(...skipping 66 matching lines...)
77 vst1_lane_u32(dptr + width, px2, 1); \ | 77 vst1_lane_u32(dptr + width, px2, 1); \ |
78 } else { \ | 78 } else { \ |
79 vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ | 79 vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ |
80 } | 80 } |
81 | 81 |
82 #define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) | 82 #define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) |
83 #define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) | 83 #define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) |
84 | 84 |
85 // Fast path for kernel sizes between 2 and 127, working on two rows at a time. | 85 // Fast path for kernel sizes between 2 and 127, working on two rows at a time. |
86 template<BlurDirection srcDirection, BlurDirection dstDirection> | 86 template<BlurDirection srcDirection, BlurDirection dstDirection> |
87 void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize, | 87 int box_blur_double(const SkPMColor** src, int srcStride, const SkIRect& srcBounds, SkPMColor** dst, int kernelSize, |
88 int leftOffset, int rightOffset, int width, int* height) { | 88 int leftOffset, int rightOffset, int width, int height) { |
89 // Load 2 pixels from adjacent rows. | 89 // Load 2 pixels from adjacent rows. |
90 auto load_2_pixels = [&](const SkPMColor* s) { | 90 auto load_2_pixels = [&](const SkPMColor* s) { |
91 if (srcDirection == BlurDirection::kX) { | 91 if (srcDirection == BlurDirection::kX) { |
92 // 10% faster by adding these 2 prefetches | 92 // 10% faster by adding these 2 prefetches |
93 SK_PREFETCH(s + 16); | 93 SK_PREFETCH(s + 16); |
94 SK_PREFETCH(s + 16 + srcStride); | 94 SK_PREFETCH(s + 16 + srcStride); |
95 auto one = vld1_lane_u32(s + 0, vdup_n_u32(0), 0), | 95 auto one = vld1_lane_u32(s + 0, vdup_n_u32(0), 0), |
96 two = vld1_lane_u32(s + srcStride, one, 1); | 96 two = vld1_lane_u32(s + srcStride, one, 1); |
97 return vreinterpret_u8_u32(two); | 97 return vreinterpret_u8_u32(two); |
98 } else { | 98 } else { |
99 return vld1_u8((uint8_t*)s); | 99 return vld1_u8((uint8_t*)s); |
100 } | 100 } |
101 }; | 101 }; |
102 int incrementStart = SkMax32(-rightOffset - 1, -width); | 102 int left = srcBounds.left(); |
103 int incrementEnd = SkMax32(width - rightOffset - 1, 0); | 103 int right = srcBounds.right(); |
104 int decrementStart = SkMin32(leftOffset, width); | 104 int top = srcBounds.top(); |
105 int bottom = srcBounds.bottom(); | |
106 int incrementStart = SkMax32(left - rightOffset - 1, -width); | |
107 int incrementEnd = SkMax32(right - rightOffset - 1, 0); | |
108 int decrementStart = SkMin32(left + leftOffset, width); | |
109 int decrementEnd = SkMin32(right + leftOffset, width); | |
105 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; | 110 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
106 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : *height; | 111 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; |
107 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; | 112 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
108 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; | 113 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; |
109 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); | 114 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); |
110 | 115 |
111 for (; *height >= 2; *height -= 2) { | 116 for (; bottom - top >= 2; top += 2) { |
112 uint16x8_t sum = vdupq_n_u16(0); | 117 uint16x8_t sum = vdupq_n_u16(0); |
113 const SkPMColor* lptr = *src; | 118 const SkPMColor* lptr = *src; |
114 const SkPMColor* rptr = *src; | 119 const SkPMColor* rptr = *src; |
115 SkPMColor* dptr = *dst; | 120 SkPMColor* dptr = *dst; |
116 int x; | 121 int x; |
117 for (x = incrementStart; x < 0; ++x) { | 122 for (x = incrementStart; x < 0; ++x) { |
118 INCREMENT_SUMS_DOUBLE(rptr); | 123 INCREMENT_SUMS_DOUBLE(rptr); |
119 rptr += srcStrideX; | 124 rptr += srcStrideX; |
120 } | 125 } |
126 for (x = 0; x < incrementStart; ++x) { | |
127 STORE_SUMS_DOUBLE | |
128 dptr += dstStrideX; | |
129 } | |
121 for (; x < decrementStart && x < incrementEnd; ++x) { | 130 for (; x < decrementStart && x < incrementEnd; ++x) { |
122 STORE_SUMS_DOUBLE | 131 STORE_SUMS_DOUBLE |
123 dptr += dstStrideX; | 132 dptr += dstStrideX; |
124 INCREMENT_SUMS_DOUBLE(rptr); | 133 INCREMENT_SUMS_DOUBLE(rptr); |
125 rptr += srcStrideX; | 134 rptr += srcStrideX; |
126 } | 135 } |
127 for (x = decrementStart; x < incrementEnd; ++x) { | 136 for (x = decrementStart; x < incrementEnd; ++x) { |
128 STORE_SUMS_DOUBLE | 137 STORE_SUMS_DOUBLE |
129 dptr += dstStrideX; | 138 dptr += dstStrideX; |
130 INCREMENT_SUMS_DOUBLE(rptr); | 139 INCREMENT_SUMS_DOUBLE(rptr); |
131 rptr += srcStrideX; | 140 rptr += srcStrideX; |
132 DECREMENT_SUMS_DOUBLE(lptr); | 141 DECREMENT_SUMS_DOUBLE(lptr); |
133 lptr += srcStrideX; | 142 lptr += srcStrideX; |
134 } | 143 } |
135 for (x = incrementEnd; x < decrementStart; ++x) { | 144 for (x = incrementEnd; x < decrementStart; ++x) { |
136 STORE_SUMS_DOUBLE | 145 STORE_SUMS_DOUBLE |
137 dptr += dstStrideX; | 146 dptr += dstStrideX; |
138 } | 147 } |
139 for (; x < width; ++x) { | 148 for (; x < decrementEnd; ++x) { |
140 STORE_SUMS_DOUBLE | 149 STORE_SUMS_DOUBLE |
141 dptr += dstStrideX; | 150 dptr += dstStrideX; |
142 DECREMENT_SUMS_DOUBLE(lptr); | 151 DECREMENT_SUMS_DOUBLE(lptr); |
143 lptr += srcStrideX; | 152 lptr += srcStrideX; |
144 } | 153 } |
154 for (; x < width; ++x) { | |
155 STORE_SUMS_DOUBLE | |
156 dptr += dstStrideX; | |
157 } | |
145 *src += srcStrideY * 2; | 158 *src += srcStrideY * 2; |
146 *dst += dstStrideY * 2; | 159 *dst += dstStrideY * 2; |
147 } | 160 } |
161 return top; | |
148 } | 162 } |
149 | 163 |
150 // ARGB -> 0A0R 0G0B | 164 // ARGB -> 0A0R 0G0B |
151 static inline uint16x4_t expand(SkPMColor p) { | 165 static inline uint16x4_t expand(SkPMColor p) { |
152 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p)))); | 166 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p)))); |
153 }; | 167 }; |
154 | 168 |
155 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); | 169 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); |
156 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23); | 170 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23); |
157 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0); | 171 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0); |
158 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c)); | 172 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c)); |
159 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c)); | 173 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c)); |
160 | 174 |
161 #define STORE_SUMS \ | 175 #define STORE_SUMS \ |
162 uint32x4_t result = vmlaq_u32(half, sum, scale); \ | 176 uint32x4_t result = vmlaq_u32(half, sum, scale); \ |
163 uint16x4_t result16 = vqshrn_n_u32(result, 16); \ | 177 uint16x4_t result16 = vqshrn_n_u32(result, 16); \ |
164 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \ | 178 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \ |
165 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); | 179 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); |
166 | 180 |
167 #define DOUBLE_ROW_OPTIMIZATION \ | 181 #define DOUBLE_ROW_OPTIMIZATION \ |
168 if (1 < kernelSize && kernelSize < 128) { \ | 182 if (1 < kernelSize && kernelSize < 128) { \ |
169 box_blur_double<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize, \ | 183 top = box_blur_double<srcDirection, dstDirection>(&src, srcStride, srcBounds, &dst, \ |
170 leftOffset, rightOffset, width, &height); \ | 184 kernelSize, leftOffset, rightOffset, \ |
185 width, height); \ |
171 } | 186 } |
172 | 187 |
173 #else // Neither NEON nor >=SSE2. | 188 #else // Neither NEON nor >=SSE2. |
174 | 189 |
175 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize; | 190 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize; |
176 #define INIT_HALF uint32_t half = 1 << 23; | 191 #define INIT_HALF uint32_t half = 1 << 23; |
177 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0; | 192 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0; |
178 #define INCREMENT_SUMS(c) \ | 193 #define INCREMENT_SUMS(c) \ |
179 sumA += SkGetPackedA32(c); \ | 194 sumA += SkGetPackedA32(c); \ |
180 sumR += SkGetPackedR32(c); \ | 195 sumR += SkGetPackedR32(c); \ |
(...skipping 12 matching lines...)
193 #define DOUBLE_ROW_OPTIMIZATION | 208 #define DOUBLE_ROW_OPTIMIZATION |
194 | 209 |
195 #endif | 210 #endif |
196 | 211 |
197 #define PREFETCH_RPTR \ | 212 #define PREFETCH_RPTR \ |
198 if (srcDirection == BlurDirection::kY) { \ | 213 if (srcDirection == BlurDirection::kY) { \ |
199 SK_PREFETCH(rptr); \ | 214 SK_PREFETCH(rptr); \ |
200 } | 215 } |
201 | 216 |
202 template<BlurDirection srcDirection, BlurDirection dstDirection> | 217 template<BlurDirection srcDirection, BlurDirection dstDirection> |
203 static void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize, | 218 static void box_blur(const SkPMColor* src, int srcStride, const SkIRect& srcBounds, SkPMColor* dst, |
204 int leftOffset, int rightOffset, int width, int height) { | 219 int kernelSize, int leftOffset, int rightOffset, int width, int height) { |
205 int incrementStart = SkMax32(-rightOffset - 1, -width); | 220 int left = srcBounds.left(); |
206 int incrementEnd = SkMax32(width - rightOffset - 1, 0); | 221 int right = srcBounds.right(); |
207 int decrementStart = SkMin32(leftOffset, width); | 222 int top = srcBounds.top(); |
223 int bottom = srcBounds.bottom(); | |
224 int incrementStart = SkMax32(left - rightOffset - 1, -width); | |
225 int incrementEnd = SkMax32(right - rightOffset - 1, 0); | |
226 int decrementStart = SkMin32(left + leftOffset, width); | |
227 int decrementEnd = SkMin32(right + leftOffset, width); | |
208 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; | 228 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
209 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; | 229 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; |
210 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; | 230 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
211 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; | 231 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; |
212 INIT_SCALE | 232 INIT_SCALE |
213 INIT_HALF | 233 INIT_HALF |
214 | 234 |
235 for (int y = 0; y < top; y++) { | |
reed1 2015/11/02 15:05:07: // fill in with zeros where we're sampling above o…
Stephen White 2015/11/02 15:23:13: Done.
236 SkColor* dptr = dst; | |
237 for (int x = 0; x < width; ++x) { | |
238 *dptr = 0; | |
239 dptr += dstStrideX; | |
240 } | |
241 dst += dstStrideY; | |
242 } | |
243 | |
215 DOUBLE_ROW_OPTIMIZATION | 244 DOUBLE_ROW_OPTIMIZATION |
216 | 245 |
217 for (int y = 0; y < height; ++y) { | 246 for (int y = top; y < bottom; ++y) { |
218 INIT_SUMS | 247 INIT_SUMS |
219 const SkPMColor* lptr = src; | 248 const SkPMColor* lptr = src; |
220 const SkPMColor* rptr = src; | 249 const SkPMColor* rptr = src; |
221 SkColor* dptr = dst; | 250 SkColor* dptr = dst; |
222 int x; | 251 int x; |
223 for (x = incrementStart; x < 0; ++x) { | 252 for (x = incrementStart; x < 0; ++x) { |
224 INCREMENT_SUMS(*rptr); | 253 INCREMENT_SUMS(*rptr); |
225 rptr += srcStrideX; | 254 rptr += srcStrideX; |
226 PREFETCH_RPTR | 255 PREFETCH_RPTR |
227 } | 256 } |
257 for (x = 0; x < incrementStart; ++x) { | |
reed1 2015/11/02 15:05:07: // fill in with zeros where we're sampling to the…
Stephen White 2015/11/02 15:23:13: Done.
258 *dptr = 0; | |
259 dptr += dstStrideX; | |
260 } | |
228 for (; x < decrementStart && x < incrementEnd; ++x) { | 261 for (; x < decrementStart && x < incrementEnd; ++x) { |
229 STORE_SUMS | 262 STORE_SUMS |
230 dptr += dstStrideX; | 263 dptr += dstStrideX; |
231 INCREMENT_SUMS(*rptr); | 264 INCREMENT_SUMS(*rptr); |
232 rptr += srcStrideX; | 265 rptr += srcStrideX; |
233 PREFETCH_RPTR | 266 PREFETCH_RPTR |
234 } | 267 } |
235 for (x = decrementStart; x < incrementEnd; ++x) { | 268 for (x = decrementStart; x < incrementEnd; ++x) { |
236 STORE_SUMS | 269 STORE_SUMS |
237 dptr += dstStrideX; | 270 dptr += dstStrideX; |
238 INCREMENT_SUMS(*rptr); | 271 INCREMENT_SUMS(*rptr); |
239 rptr += srcStrideX; | 272 rptr += srcStrideX; |
240 PREFETCH_RPTR | 273 PREFETCH_RPTR |
241 DECREMENT_SUMS(*lptr); | 274 DECREMENT_SUMS(*lptr); |
242 lptr += srcStrideX; | 275 lptr += srcStrideX; |
243 } | 276 } |
244 for (x = incrementEnd; x < decrementStart; ++x) { | 277 for (x = incrementEnd; x < decrementStart; ++x) { |
245 STORE_SUMS | 278 STORE_SUMS |
246 dptr += dstStrideX; | 279 dptr += dstStrideX; |
247 } | 280 } |
248 for (; x < width; ++x) { | 281 for (; x < decrementEnd; ++x) { |
249 STORE_SUMS | 282 STORE_SUMS |
250 dptr += dstStrideX; | 283 dptr += dstStrideX; |
251 DECREMENT_SUMS(*lptr); | 284 DECREMENT_SUMS(*lptr); |
252 lptr += srcStrideX; | 285 lptr += srcStrideX; |
253 } | 286 } |
287 for (; x < width; ++x) { | |
reed1 2015/11/02 15:05:07: // fill in with zeros where we're sampling to the…
Stephen White 2015/11/02 15:23:13: Done.
288 *dptr = 0; | |
289 dptr += dstStrideX; | |
290 } | |
254 src += srcStrideY; | 291 src += srcStrideY; |
255 dst += dstStrideY; | 292 dst += dstStrideY; |
256 } | 293 } |
294 for (int y = bottom; y < height; ++y) { | |
reed1 2015/11/02 15:05:07: // fill in with zeros where we're sampling below o…
Stephen White 2015/11/02 15:23:13: Done.
295 SkColor* dptr = dst; | |
296 for (int x = 0; x < width; ++x) { | |
297 *dptr = 0; | |
298 dptr += dstStrideX; | |
299 } | |
300 dst += dstStrideY; | |
301 } | |
257 } | 302 } |
258 | 303 |
259 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>, | 304 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>, |
260 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>, | 305 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>, |
261 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>; | 306 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>; |
262 | 307 |
263 } // namespace SK_OPTS_NS | 308 } // namespace SK_OPTS_NS |
264 | 309 |
265 #endif | 310 #endif |
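
A minimal scalar sketch of the loop structure under review, for reference. It is not the Skia implementation: it blurs a single row of single-channel uint8_t pixels, the name box_blur_row and the plain integer division are stand-ins for illustration, and src is assumed to point at the first pixel inside the source bounds (logical column left). It shows how incrementStart/incrementEnd/decrementStart/decrementEnd partition the output row once srcBounds is taken into account: zeros where the kernel never overlaps [left, right), a growing window at the left edge, a sliding window in the middle, and a shrinking window at the right edge.

// Illustrative sketch only -- not the Skia code from this CL.
#include <algorithm>
#include <cstdint>

static void box_blur_row(const uint8_t* src, uint8_t* dst, int width,
                         int left, int right,              // source bounds within the row
                         int leftOffset, int rightOffset,  // kernel extent around x
                         int kernelSize) {
    int incrementStart = std::max(left - rightOffset - 1, -width);
    int incrementEnd   = std::max(right - rightOffset - 1, 0);
    int decrementStart = std::min(left + leftOffset, width);
    int decrementEnd   = std::min(right + leftOffset, width);
    uint32_t sum = 0;
    const uint8_t* lptr = src;
    const uint8_t* rptr = src;
    int x;
    for (x = incrementStart; x < 0; ++x) {                // prime the sum with the pixels under the first window
        sum += *rptr++;
    }
    for (x = 0; x < incrementStart; ++x) {                // kernel entirely left of the bounds
        *dst++ = 0;
    }
    for (; x < decrementStart && x < incrementEnd; ++x) { // window growing into the bounds
        *dst++ = uint8_t(sum / kernelSize);
        sum += *rptr++;
    }
    for (x = decrementStart; x < incrementEnd; ++x) {     // window fully inside: slide it
        *dst++ = uint8_t(sum / kernelSize);
        sum += *rptr++;
        sum -= *lptr++;
    }
    for (x = incrementEnd; x < decrementStart; ++x) {     // window wider than the bounds
        *dst++ = uint8_t(sum / kernelSize);
    }
    for (; x < decrementEnd; ++x) {                       // window shrinking out of the bounds
        *dst++ = uint8_t(sum / kernelSize);
        sum -= *lptr++;
    }
    for (; x < width; ++x) {                              // kernel entirely right of the bounds
        *dst++ = 0;
    }
}

In the CL itself the same partition runs over SkPMColor pixels, two rows at a time in box_blur_double, with the running sum held in NEON registers (uint16x8_t in the double-row path, uint32x4_t otherwise) and the division replaced by a fixed-point multiply: the scale is (1 << 15) / kernelSize in the double-row path, and (1 << 24) / kernelSize with a rounding half of 1 << 23 in the general path.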