| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkBlurImageFilter_opts_DEFINED | 8 #ifndef SkBlurImageFilter_opts_DEFINED |
| 9 #define SkBlurImageFilter_opts_DEFINED | 9 #define SkBlurImageFilter_opts_DEFINED |
| 10 | 10 |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 77 vst1_lane_u32(dptr + width, px2, 1); \ | 77 vst1_lane_u32(dptr + width, px2, 1); \ |
| 78 } else { \ | 78 } else { \ |
| 79 vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ | 79 vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ |
| 80 } | 80 } |
| 81 | 81 |
| 82 #define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) | 82 #define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) |
| 83 #define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) | 83 #define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) |
| 84 | 84 |
| 85 // Fast path for kernel sizes between 2 and 127, working on two rows at a time. | 85 // Fast path for kernel sizes between 2 and 127, working on two rows at a time. |
| 86 template<BlurDirection srcDirection, BlurDirection dstDirection> | 86 template<BlurDirection srcDirection, BlurDirection dstDirection> |
| 87 void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int
kernelSize, | 87 int box_blur_double(const SkPMColor** src, int srcStride, const SkIRect& srcBoun
ds, SkPMColor** dst, int kernelSize, |
| 88 int leftOffset, int rightOffset, int width, int* height) { | 88 int leftOffset, int rightOffset, int width, int height) { |
| 89 // Load 2 pixels from adjacent rows. | 89 // Load 2 pixels from adjacent rows. |
| 90 auto load_2_pixels = [&](const SkPMColor* s) { | 90 auto load_2_pixels = [&](const SkPMColor* s) { |
| 91 if (srcDirection == BlurDirection::kX) { | 91 if (srcDirection == BlurDirection::kX) { |
| 92 // 10% faster by adding these 2 prefetches | 92 // 10% faster by adding these 2 prefetches |
| 93 SK_PREFETCH(s + 16); | 93 SK_PREFETCH(s + 16); |
| 94 SK_PREFETCH(s + 16 + srcStride); | 94 SK_PREFETCH(s + 16 + srcStride); |
| 95 auto one = vld1_lane_u32(s + 0, vdup_n_u32(0), 0), | 95 auto one = vld1_lane_u32(s + 0, vdup_n_u32(0), 0), |
| 96 two = vld1_lane_u32(s + srcStride, one, 1); | 96 two = vld1_lane_u32(s + srcStride, one, 1); |
| 97 return vreinterpret_u8_u32(two); | 97 return vreinterpret_u8_u32(two); |
| 98 } else { | 98 } else { |
| 99 return vld1_u8((uint8_t*)s); | 99 return vld1_u8((uint8_t*)s); |
| 100 } | 100 } |
| 101 }; | 101 }; |
| 102 int incrementStart = SkMax32(-rightOffset - 1, -width); | 102 int left = srcBounds.left(); |
| 103 int incrementEnd = SkMax32(width - rightOffset - 1, 0); | 103 int right = srcBounds.right(); |
| 104 int decrementStart = SkMin32(leftOffset, width); | 104 int top = srcBounds.top(); |
| 105 int bottom = srcBounds.bottom(); |
| 106 int incrementStart = SkMax32(left - rightOffset - 1, left - right); |
| 107 int incrementEnd = SkMax32(right - rightOffset - 1, 0); |
| 108 int decrementStart = SkMin32(left + leftOffset, width); |
| 109 int decrementEnd = SkMin32(right + leftOffset, width); |
| 105 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; | 110 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
| 106 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : *height; | 111 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; |
| 107 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; | 112 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
| 108 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; | 113 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; |
| 109 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); | 114 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); |
| 110 | 115 |
| 111 for (; *height >= 2; *height -= 2) { | 116 for (; bottom - top >= 2; top += 2) { |
| 112 uint16x8_t sum = vdupq_n_u16(0); | 117 uint16x8_t sum = vdupq_n_u16(0); |
| 113 const SkPMColor* lptr = *src; | 118 const SkPMColor* lptr = *src; |
| 114 const SkPMColor* rptr = *src; | 119 const SkPMColor* rptr = *src; |
| 115 SkPMColor* dptr = *dst; | 120 SkPMColor* dptr = *dst; |
| 116 int x; | 121 int x; |
| 117 for (x = incrementStart; x < 0; ++x) { | 122 for (x = incrementStart; x < 0; ++x) { |
| 118 INCREMENT_SUMS_DOUBLE(rptr); | 123 INCREMENT_SUMS_DOUBLE(rptr); |
| 119 rptr += srcStrideX; | 124 rptr += srcStrideX; |
| 120 } | 125 } |
| 126 // Clear to zero when sampling to the left of our domain. "sum" is zero her
e because we |
| 127 // initialized it above, and the preceding loop has no effect in this c
ase. |
| 128 for (x = 0; x < incrementStart; ++x) { |
| 129 STORE_SUMS_DOUBLE |
| 130 dptr += dstStrideX; |
| 131 } |
| 121 for (; x < decrementStart && x < incrementEnd; ++x) { | 132 for (; x < decrementStart && x < incrementEnd; ++x) { |
| 122 STORE_SUMS_DOUBLE | 133 STORE_SUMS_DOUBLE |
| 123 dptr += dstStrideX; | 134 dptr += dstStrideX; |
| 124 INCREMENT_SUMS_DOUBLE(rptr); | 135 INCREMENT_SUMS_DOUBLE(rptr); |
| 125 rptr += srcStrideX; | 136 rptr += srcStrideX; |
| 126 } | 137 } |
| 127 for (x = decrementStart; x < incrementEnd; ++x) { | 138 for (x = decrementStart; x < incrementEnd; ++x) { |
| 128 STORE_SUMS_DOUBLE | 139 STORE_SUMS_DOUBLE |
| 129 dptr += dstStrideX; | 140 dptr += dstStrideX; |
| 130 INCREMENT_SUMS_DOUBLE(rptr); | 141 INCREMENT_SUMS_DOUBLE(rptr); |
| 131 rptr += srcStrideX; | 142 rptr += srcStrideX; |
| 132 DECREMENT_SUMS_DOUBLE(lptr); | 143 DECREMENT_SUMS_DOUBLE(lptr); |
| 133 lptr += srcStrideX; | 144 lptr += srcStrideX; |
| 134 } | 145 } |
| 135 for (x = incrementEnd; x < decrementStart; ++x) { | 146 for (x = incrementEnd; x < decrementStart; ++x) { |
| 136 STORE_SUMS_DOUBLE | 147 STORE_SUMS_DOUBLE |
| 137 dptr += dstStrideX; | 148 dptr += dstStrideX; |
| 138 } | 149 } |
| 139 for (; x < width; ++x) { | 150 for (; x < decrementEnd; ++x) { |
| 140 STORE_SUMS_DOUBLE | 151 STORE_SUMS_DOUBLE |
| 141 dptr += dstStrideX; | 152 dptr += dstStrideX; |
| 142 DECREMENT_SUMS_DOUBLE(lptr); | 153 DECREMENT_SUMS_DOUBLE(lptr); |
| 143 lptr += srcStrideX; | 154 lptr += srcStrideX; |
| 144 } | 155 } |
| 156 // Clear to zero when sampling to the right of our domain. "sum" is zero
here because we |
| 157 // added on then subtracted off all of the pixels, leaving zero. |
| 158 for (; x < width; ++x) { |
| 159 STORE_SUMS_DOUBLE |
| 160 dptr += dstStrideX; |
| 161 } |
| 145 *src += srcStrideY * 2; | 162 *src += srcStrideY * 2; |
| 146 *dst += dstStrideY * 2; | 163 *dst += dstStrideY * 2; |
| 147 } | 164 } |
| 165 return top; |
| 148 } | 166 } |
| 149 | 167 |
| 150 // ARGB -> 0A0R 0G0B | 168 // ARGB -> 0A0R 0G0B |
| 151 static inline uint16x4_t expand(SkPMColor p) { | 169 static inline uint16x4_t expand(SkPMColor p) { |
| 152 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p)))); | 170 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p)))); |
| 153 }; | 171 }; |
| 154 | 172 |
| 155 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); | 173 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); |
| 156 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23); | 174 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23); |
| 157 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0); | 175 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0); |
| 158 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c)); | 176 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c)); |
| 159 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c)); | 177 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c)); |
| 160 | 178 |
| 161 #define STORE_SUMS \ | 179 #define STORE_SUMS \ |
| 162 uint32x4_t result = vmlaq_u32(half, sum, scale); \ | 180 uint32x4_t result = vmlaq_u32(half, sum, scale); \ |
| 163 uint16x4_t result16 = vqshrn_n_u32(result, 16); \ | 181 uint16x4_t result16 = vqshrn_n_u32(result, 16); \ |
| 164 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \ | 182 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \ |
| 165 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); | 183 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); |
| 166 | 184 |
| 167 #define DOUBLE_ROW_OPTIMIZATION \ | 185 #define DOUBLE_ROW_OPTIMIZATION \ |
| 168 if (1 < kernelSize && kernelSize < 128) { \ | 186 if (1 < kernelSize && kernelSize < 128) { \ |
| 169 box_blur_double<srcDirection, dstDirection>(&src, srcStride, &dst, kerne
lSize, \ | 187 top = box_blur_double<srcDirection, dstDirection>(&src, srcStride, srcBo
unds, &dst, \ |
| 170 leftOffset, rightOffset, wid
th, &height); \ | 188 kernelSize, leftOffset
, rightOffset, \ |
| 189 width, height); \ |
| 171 } | 190 } |
| 172 | 191 |
| 173 #else // Neither NEON nor >=SSE2. | 192 #else // Neither NEON nor >=SSE2. |
| 174 | 193 |
| 175 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize; | 194 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize; |
| 176 #define INIT_HALF uint32_t half = 1 << 23; | 195 #define INIT_HALF uint32_t half = 1 << 23; |
| 177 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0; | 196 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0; |
| 178 #define INCREMENT_SUMS(c) \ | 197 #define INCREMENT_SUMS(c) \ |
| 179 sumA += SkGetPackedA32(c); \ | 198 sumA += SkGetPackedA32(c); \ |
| 180 sumR += SkGetPackedR32(c); \ | 199 sumR += SkGetPackedR32(c); \ |
| (...skipping 12 matching lines...) Expand all Loading... |
| 193 #define DOUBLE_ROW_OPTIMIZATION | 212 #define DOUBLE_ROW_OPTIMIZATION |
| 194 | 213 |
| 195 #endif | 214 #endif |
| 196 | 215 |
| 197 #define PREFETCH_RPTR \ | 216 #define PREFETCH_RPTR \ |
| 198 if (srcDirection == BlurDirection::kY) { \ | 217 if (srcDirection == BlurDirection::kY) { \ |
| 199 SK_PREFETCH(rptr); \ | 218 SK_PREFETCH(rptr); \ |
| 200 } | 219 } |
| 201 | 220 |
| 202 template<BlurDirection srcDirection, BlurDirection dstDirection> | 221 template<BlurDirection srcDirection, BlurDirection dstDirection> |
| 203 static void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int ke
rnelSize, | 222 static void box_blur(const SkPMColor* src, int srcStride, const SkIRect& srcBoun
ds, SkPMColor* dst, |
| 204 int leftOffset, int rightOffset, int width, int height) { | 223 int kernelSize, int leftOffset, int rightOffset, int width,
int height) { |
| 205 int incrementStart = SkMax32(-rightOffset - 1, -width); | 224 int left = srcBounds.left(); |
| 206 int incrementEnd = SkMax32(width - rightOffset - 1, 0); | 225 int right = srcBounds.right(); |
| 207 int decrementStart = SkMin32(leftOffset, width); | 226 int top = srcBounds.top(); |
| 227 int bottom = srcBounds.bottom(); |
| 228 int incrementStart = SkMax32(left - rightOffset - 1, left - right); |
| 229 int incrementEnd = SkMax32(right - rightOffset - 1, 0); |
| 230 int decrementStart = SkMin32(left + leftOffset, width); |
| 231 int decrementEnd = SkMin32(right + leftOffset, width); |
| 208 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; | 232 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; |
| 209 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; | 233 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height; |
| 210 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; | 234 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; |
| 211 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; | 235 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1; |
| 212 INIT_SCALE | 236 INIT_SCALE |
| 213 INIT_HALF | 237 INIT_HALF |
| 214 | 238 |
| 239 // Clear to zero when sampling above our domain. |
| 240 for (int y = 0; y < top; y++) { |
| 241 SkColor* dptr = dst; |
| 242 for (int x = 0; x < width; ++x) { |
| 243 *dptr = 0; |
| 244 dptr += dstStrideX; |
| 245 } |
| 246 dst += dstStrideY; |
| 247 } |
| 248 |
| 215 DOUBLE_ROW_OPTIMIZATION | 249 DOUBLE_ROW_OPTIMIZATION |
| 216 | 250 |
| 217 for (int y = 0; y < height; ++y) { | 251 for (int y = top; y < bottom; ++y) { |
| 218 INIT_SUMS | 252 INIT_SUMS |
| 219 const SkPMColor* lptr = src; | 253 const SkPMColor* lptr = src; |
| 220 const SkPMColor* rptr = src; | 254 const SkPMColor* rptr = src; |
| 221 SkColor* dptr = dst; | 255 SkColor* dptr = dst; |
| 222 int x; | 256 int x; |
| 223 for (x = incrementStart; x < 0; ++x) { | 257 for (x = incrementStart; x < 0; ++x) { |
| 224 INCREMENT_SUMS(*rptr); | 258 INCREMENT_SUMS(*rptr); |
| 225 rptr += srcStrideX; | 259 rptr += srcStrideX; |
| 226 PREFETCH_RPTR | 260 PREFETCH_RPTR |
| 227 } | 261 } |
| 262 // Clear to zero when sampling to the left of our domain. |
| 263 for (x = 0; x < incrementStart; ++x) { |
| 264 *dptr = 0; |
| 265 dptr += dstStrideX; |
| 266 } |
| 228 for (; x < decrementStart && x < incrementEnd; ++x) { | 267 for (; x < decrementStart && x < incrementEnd; ++x) { |
| 229 STORE_SUMS | 268 STORE_SUMS |
| 230 dptr += dstStrideX; | 269 dptr += dstStrideX; |
| 231 INCREMENT_SUMS(*rptr); | 270 INCREMENT_SUMS(*rptr); |
| 232 rptr += srcStrideX; | 271 rptr += srcStrideX; |
| 233 PREFETCH_RPTR | 272 PREFETCH_RPTR |
| 234 } | 273 } |
| 235 for (x = decrementStart; x < incrementEnd; ++x) { | 274 for (x = decrementStart; x < incrementEnd; ++x) { |
| 236 STORE_SUMS | 275 STORE_SUMS |
| 237 dptr += dstStrideX; | 276 dptr += dstStrideX; |
| 238 INCREMENT_SUMS(*rptr); | 277 INCREMENT_SUMS(*rptr); |
| 239 rptr += srcStrideX; | 278 rptr += srcStrideX; |
| 240 PREFETCH_RPTR | 279 PREFETCH_RPTR |
| 241 DECREMENT_SUMS(*lptr); | 280 DECREMENT_SUMS(*lptr); |
| 242 lptr += srcStrideX; | 281 lptr += srcStrideX; |
| 243 } | 282 } |
| 244 for (x = incrementEnd; x < decrementStart; ++x) { | 283 for (x = incrementEnd; x < decrementStart; ++x) { |
| 245 STORE_SUMS | 284 STORE_SUMS |
| 246 dptr += dstStrideX; | 285 dptr += dstStrideX; |
| 247 } | 286 } |
| 248 for (; x < width; ++x) { | 287 for (; x < decrementEnd; ++x) { |
| 249 STORE_SUMS | 288 STORE_SUMS |
| 250 dptr += dstStrideX; | 289 dptr += dstStrideX; |
| 251 DECREMENT_SUMS(*lptr); | 290 DECREMENT_SUMS(*lptr); |
| 252 lptr += srcStrideX; | 291 lptr += srcStrideX; |
| 253 } | 292 } |
| 293 // Clear to zero when sampling to the right of our domain. |
| 294 for (; x < width; ++x) { |
| 295 *dptr = 0; |
| 296 dptr += dstStrideX; |
| 297 } |
| 254 src += srcStrideY; | 298 src += srcStrideY; |
| 255 dst += dstStrideY; | 299 dst += dstStrideY; |
| 256 } | 300 } |
| 301 // Clear to zero when sampling below our domain. |
| 302 for (int y = bottom; y < height; ++y) { |
| 303 SkColor* dptr = dst; |
| 304 for (int x = 0; x < width; ++x) { |
| 305 *dptr = 0; |
| 306 dptr += dstStrideX; |
| 307 } |
| 308 dst += dstStrideY; |
| 309 } |
| 257 } | 310 } |
| 258 | 311 |
| 259 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>, | 312 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>, |
| 260 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>, | 313 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>, |
| 261 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>; | 314 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>; |
| 262 | 315 |
| 263 } // namespace SK_OPTS_NS | 316 } // namespace SK_OPTS_NS |
| 264 | 317 |
| 265 #endif | 318 #endif |
| OLD | NEW |