src/opts/SkBlurImage_opts_neon.cpp - Issue 105893003: NEON fast path for box blur

Side by Side Diff: src/opts/SkBlurImage_opts_neon.cpp

Issue 105893003: NEON fast path for box blur (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Created 7 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 The Android Open Source Project	2 * Copyright 2013 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8	8

9 #include "SkBitmap.h"	9 #include "SkBitmap.h"

10 #include "SkColorPriv.h"	10 #include "SkColorPriv.h"

11 #include "SkBlurImage_opts.h"	11 #include "SkBlurImage_opts.h"

12 #include "SkRect.h"	12 #include "SkRect.h"

13	13

14 #include <arm_neon.h>	14 #include <arm_neon.h>

15	15

16 namespace {	16 namespace {

17	17

18 enum BlurDirection {	18 enum BlurDirection {

19 kX, kY	19 kX, kY

20 };	20 };

21	21

22 /**	22 /**

	23 * Helper function to load 2 pixels from different rows to a 8x8 NEON register

	24 * and also pre-load pixels for future read

	25 */

	26 template<BlurDirection srcDirection>

	27 inline uint8x8_t load2pixels(const SkPMColor* src, int srcStride) {
	Stephen White 2013/12/12 05:30:30: should be separated by underscores; see https://sites.google.com/site/skiadocs/developer-documentation/contributing-c..., so load_2_pixels() zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > should be separated by underscores; see > https://sites.google.com/site/skiadocs/developer-documentation/contributing-c..., > so load_2_pixels() Done.
	28 if (srcDirection == kX) {

	29 uint32x2_t temp = vdup_n_u32(0);

	30 SK_PREFETCH(src + 16);
	Stephen White 2013/12/12 05:30:30: Are you sure this offset is correct? src is SkPMColor*, so unless I've done my pointer math wrong, I think this would be prefetching 64 bytes ahead. Is that what you had in mind? At any rate, I think we should check that the prefetches are actually doing anything in either X or Y, and skip them if they aren't. There are X-only and Y-only benches which should show the effect. zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > Are you sure this offset is correct? src is SkPMColor*, so unless I've done my > pointer math wrong, I think this would be prefetching 64 bytes ahead. Is that > what you had in mind? > > At any rate, I think we should check that the prefetches are actually doing > anything in either X or Y, and skip them if they aren't. There are X-only and > Y-only benches which should show the effect. Yes, I actually mean 64 bytes ahead, that's 8 loops. Because it need time to let the cache system to pre-fetch data in parallel. I've done some tests locally. 10% faster by adding 2 prefetchs in (srcDirection == kX). adding prefetch in the else branch doesn't affect the bench data I left the pre-fetch in (srcDirection == kY), only for keep the symmetry. Do you mean we'd better remove them?
	31 SK_PREFETCH(src + srcStride + 16);

	32 return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(src, temp, 0), 1));

	33 } else {

	34 SK_PREFETCH(src + srcStride);

	35 return vld1_u8((uint8_t*)src);

	36 }

	37 }

	38

	39 /**

	40 * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows

	41 */

	42 template<BlurDirection dstDirection>

	43 inline void store2pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) {
	Stephen White 2013/12/12 05:30:30: store_2_pixels() zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > store_2_pixels() Done.
	44 if (dstDirection == kX) {

	45 uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8));

	46 vst1_lane_u32(dst, temp, 0);

	47 vst1_lane_u32(dst + dstStride, temp, 1);

	48 } else {

	49 uint8x8_t temp = vmovn_u16(result16x8);

	50 vst1_u8((uint8_t*)dst, temp);

	51 }

	52 }

	53

	54 /**

	55 * fast path for kernel size less than 128

	56 */

	57 template<BlurDirection srcDirection, BlurDirection dstDirection>

	58 void SkFastBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
	Stephen White 2013/12/12 05:30:30: I'd prefer to avoid names like "fast" (since we often come up with faster algorithms later). Let's name it by what it does, perhaps SkDoubleRowBoxBlur_NEON. zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > I'd prefer to avoid names like "fast" (since we often come up with faster > algorithms later). Let's name it by what it does, perhaps > SkDoubleRowBoxBlur_NEON. Done.
	59 int leftOffset, int rightOffset, int width, int height)

	60 {

	61 const int rightBorder = SkMin32(rightOffset + 1, width);

	62 const int srcStrideX = srcDirection == kX ? 1 : srcStride;

	63 const int dstStrideX = dstDirection == kX ? 1 : height;

	64 const int srcStrideY = srcDirection == kX ? srcStride : 1;

	65 const int dstStrideY = dstDirection == kX ? width : 1;

	66 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);

	67

	68 int x, y;

	69 height &= ~1;
	Stephen White 2013/12/12 05:30:30: Let's just pass in pointers to src, dst and height. Then the loop can become: for (; height > 0; height -= 2) ... and we can omit the &= -1 here and below. zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > Let's just pass in pointers to src, dst and height. Then the loop can become: > > for (; height > 0; height -= 2) > ... > > and we can omit the &= -1 here and below. Done.
	70 for (y = 0; y < height; y += 2) {

	71 uint16x8_t sum = vdupq_n_u16(0);

	72 uint16x8_t val16x8;

	73 uint8x8_t val8x8;

	74 const SkPMColor* sptr = src;

	75 for (x = 0; x < rightBorder; x++) {

	76 val8x8 = load2pixels<srcDirection>(sptr, srcStride);

	77 sum = vaddw_u8(sum, val8x8);

	78 sptr += srcStrideX;

	79 }

	80

	81 sptr = src;

	82 SkPMColor* dptr = dst;

	83 for (x = 0; x < width; x++) {

	84 // val = (sum * scale * 2 + 0x8000) >> 16

	85 val16x8 = vreinterpretq_u16_s16(vqrdmulhq_s16(
	Stephen White 2013/12/12 05:30:30: Please declare this variable here where it's used, not up top. And since you'll see the type in context that way, I wouldn't bother naming it with the type. E.g., uint16x8_t result = ...; store2pixels<dstDirection>(result, ...); zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > Please declare this variable here where it's used, not up top. And since you'll > see the type in context that way, I wouldn't bother naming it with the type. > E.g., > > uint16x8_t result = ...; > store2pixels<dstDirection>(result, ...); Done.
	86 vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale)));

	87 store2pixels<dstDirection>(val16x8, dptr, width);

	88

	89 if (x >= leftOffset) {

	90 val8x8 = load2pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride);
	Stephen White 2013/12/12 05:30:30: Same here. zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > Same here. Done.
	91 sum = vsubw_u8(sum, val8x8);

	92 }

	93 if (x + rightOffset + 1 < width) {

	94 val8x8 = load2pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride);
	Stephen White 2013/12/12 05:30:30: Same here. zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > Same here. Done.
	95 sum = vaddw_u8(sum, val8x8);

	96 }

	97 sptr += srcStrideX;

	98 dptr += dstStrideX;

	99 }

	100 src += srcStrideY * 2;

	101 dst += dstStrideY * 2;

	102 }

	103 }

	104

	105

	106 /**

23 * Helper function to spread the components of a 32-bit integer into the	107 * Helper function to spread the components of a 32-bit integer into the

24 * lower 8 bits of each 16-bit element of a NEON register.	108 * lower 8 bits of each 16-bit element of a NEON register.

25 */	109 */

26	110

27 static inline uint16x4_t expand(uint32_t a) {	111 static inline uint16x4_t expand(uint32_t a) {

28 // ( ARGB ) -> ( ARGB ARGB ) -> ( A R G B A R G B )	112 // ( ARGB ) -> ( ARGB ARGB ) -> ( A R G B A R G B )

29 uint8x8_t v8 = vreinterpret_u8_u32(vdup_n_u32(a));	113 uint8x8_t v8 = vreinterpret_u8_u32(vdup_n_u32(a));

30 // ( A R G B A R G B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B )	114 // ( A R G B A R G B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B )

31 return vget_low_u16(vmovl_u8(v8));	115 return vget_low_u16(vmovl_u8(v8));

32 }	116 }

33	117

34 template<BlurDirection srcDirection, BlurDirection dstDirection>	118 template<BlurDirection srcDirection, BlurDirection dstDirection>

35 void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,	119 void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,

36 int leftOffset, int rightOffset, int width, int height)	120 int leftOffset, int rightOffset, int width, int height)

37 {	121 {

38 const int rightBorder = SkMin32(rightOffset + 1, width);	122 const int rightBorder = SkMin32(rightOffset + 1, width);

39 const int srcStrideX = srcDirection == kX ? 1 : srcStride;	123 const int srcStrideX = srcDirection == kX ? 1 : srcStride;

40 const int dstStrideX = dstDirection == kX ? 1 : height;	124 const int dstStrideX = dstDirection == kX ? 1 : height;

41 const int srcStrideY = srcDirection == kX ? srcStride : 1;	125 const int srcStrideY = srcDirection == kX ? srcStride : 1;

42 const int dstStrideY = dstDirection == kX ? width : 1;	126 const int dstStrideY = dstDirection == kX ? width : 1;

43 const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);	127 const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);

44 const uint32x4_t half = vdupq_n_u32(1 << 23);	128 const uint32x4_t half = vdupq_n_u32(1 << 23);

45 for (int y = 0; y < height; ++y) {	129

	130 int fastHeight = 0;

	131 if (kernelSize < 128)
	Stephen White 2013/12/12 05:30:30: You should add a GM and a bench for the old case, so that it still gets exercised (kernel size exactly 128, say). zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > You should add a GM and a bench for the old case, so that it still gets > exercised (kernel size exactly 128, say). I think there is no API can set the box kernel size. box kernel size is nearly 2 times sigma. We can make a blur case with sigma 80.
	132 {

	133 SkFastBoxBlur_NEON<srcDirection, dstDirection>(src, srcStride, dst, kernelSize,

	134 leftOffset, rightOffset, width, height);
	Stephen White 2013/12/12 05:30:30: See above: if we pass in &src &dst and &height here, we don't need to repeat the computations below. zheng.xu 2013/12/12 08:15:37: On 2013/12/12 05:30:30, Stephen White wrote: > See above: if we pass in &src &dst and &height here, we don't need to repeat the > computations below. Done.
	135 fastHeight = height & (~1);

	136 src += srcStrideY * fastHeight;

	137 dst += dstStrideY * fastHeight;

	138 }

	139

	140 for (int y = fastHeight; y < height; ++y) {

46 uint32x4_t sum = vdupq_n_u32(0);	141 uint32x4_t sum = vdupq_n_u32(0);

47 const SkPMColor* p = src;	142 const SkPMColor* p = src;

48 for (int i = 0; i < rightBorder; ++i) {	143 for (int i = 0; i < rightBorder; ++i) {

49 sum = vaddw_u16(sum, expand(*p));	144 sum = vaddw_u16(sum, expand(*p));

50 p += srcStrideX;	145 p += srcStrideX;

51 }	146 }

52	147

53 const SkPMColor* sptr = src;	148 const SkPMColor* sptr = src;

54 SkPMColor* dptr = dst;	149 SkPMColor* dptr = dst;

55 for (int x = 0; x < width; ++x) {	150 for (int x = 0; x < width; ++x) {

(...skipping 36 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
92 bool SkBoxBlurGetPlatformProcs_NEON(SkBoxBlurProc* boxBlurX,	187 bool SkBoxBlurGetPlatformProcs_NEON(SkBoxBlurProc* boxBlurX,

93 SkBoxBlurProc* boxBlurY,	188 SkBoxBlurProc* boxBlurY,

94 SkBoxBlurProc* boxBlurXY,	189 SkBoxBlurProc* boxBlurXY,

95 SkBoxBlurProc* boxBlurYX) {	190 SkBoxBlurProc* boxBlurYX) {

96 *boxBlurX = SkBoxBlur_NEON<kX, kX>;	191 *boxBlurX = SkBoxBlur_NEON<kX, kX>;

97 *boxBlurY = SkBoxBlur_NEON<kY, kY>;	192 *boxBlurY = SkBoxBlur_NEON<kY, kY>;

98 *boxBlurXY = SkBoxBlur_NEON<kX, kY>;	193 *boxBlurXY = SkBoxBlur_NEON<kX, kY>;

99 *boxBlurYX = SkBoxBlur_NEON<kY, kX>;	194 *boxBlurYX = SkBoxBlur_NEON<kY, kX>;

100 return true;	195 return true;

101 }	196 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »