src/opts/SkBlurImageFilter_opts.h - Issue 1408003007: Refactor SkBlurImageFilter_Opts.h.

Side by Side Diff: src/opts/SkBlurImageFilter_opts.h

Issue 1408003007: Refactor SkBlurImageFilter_Opts.h. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Changes per review Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2015 Google Inc.	2 * Copyright 2015 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkBlurImageFilter_opts_DEFINED	8 #ifndef SkBlurImageFilter_opts_DEFINED

9 #define SkBlurImageFilter_opts_DEFINED	9 #define SkBlurImageFilter_opts_DEFINED

10	10

11 #include "SkColorPriv.h"	11 #include "SkColorPriv.h"

12 #include "SkTypes.h"	12 #include "SkTypes.h"

13	13

14 namespace SK_OPTS_NS {	14 namespace SK_OPTS_NS {

15	15

16 enum class BlurDirection { kX, kY };	16 enum class BlurDirection { kX, kY };

17	17

18 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	18 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

19 template<BlurDirection srcDirection, BlurDirection dstDirection>

20 void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSiz e,

21 int leftOffset, int rightOffset, int width, int height) {

22 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41	19 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

23 // ARGB -> 000A 000R 000G 000B	20 // ARGB -> 000A 000R 000G 000B

24 auto expand = [](int p) {	21 static inline __m128i expand(SkPMColor p) {

25 return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));	22 return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));

26 };	23 };

27 // Axxx Rxxx Gxxx Bxxx -> ARGB	24 // Axxx Rxxx Gxxx Bxxx -> ARGB

28 auto repack = [](__m128i p) {	25 static inline SkPMColor repack(__m128i p) {

29 const char _ = ~0; // Don't care what ends up in these bytes. This zer os them.	26 const char _ = ~0; // Don't care what ends up in these bytes. This zeros t hem.

30 p = _mm_shuffle_epi8(p, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7, 3));	27 p = _mm_shuffle_epi8(p, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7,3));

31 return _mm_cvtsi128_si32(p);	28 return _mm_cvtsi128_si32(p);

32 };	29 };

33	30

34 #else	31 #else

35 // ARGB -> 000A 000R 000G 000B	32 // ARGB -> 000A 000R 000G 000B

36 auto expand = [](int p) {	33 static inline __m128i expand(int p) {

37 auto result = _mm_cvtsi32_si128(p);	34 auto result = _mm_cvtsi32_si128(p);

38 result = _mm_unpacklo_epi8(result, _mm_setzero_si128());	35 result = _mm_unpacklo_epi8(result, _mm_setzero_si128());

39 result = _mm_unpacklo_epi16(result, _mm_setzero_si128());	36 result = _mm_unpacklo_epi16(result, _mm_setzero_si128());

40 return result;	37 return result;

41 };	38 };

42 // Axxx Rxxx Gxxx Bxxx -> ARGB	39 // Axxx Rxxx Gxxx Bxxx -> ARGB

43 auto repack = [](__m128i p) {	40 static inline SkPMColor repack(__m128i p) {

44 p = _mm_srli_epi32(p, 24); // 000A 000R 000G 000B	41 p = _mm_srli_epi32(p, 24); // 000A 000R 000G 000B

45 p = _mm_packs_epi32(p, p); // xxxx xxxx 0A0R 0G0B	42 p = _mm_packs_epi32(p, p); // xxxx xxxx 0A0R 0G0B

46 p = _mm_packus_epi16(p, p); // xxxx xxxx xxxx ARGB	43 p = _mm_packus_epi16(p, p); // xxxx xxxx xxxx ARGB

47 return _mm_cvtsi128_si32(p);	44 return _mm_cvtsi128_si32(p);

48 };	45 };

49	46

50 // _mm_mullo_epi32 is not available, so use the standard trick to emulate it .	47 // _mm_mullo_epi32 is not available, so use the standard trick to emulate it.

51 auto _mm_mullo_epi32 = [](__m128i a, __m128i b) {	48 static inline __m128i _mm_mullo_epi32(__m128i a, __m128i b) {
	mtklein 2015/10/27 20:24:34 Annoyingly MSVC makes all the intrinsics visible. Annoyingly MSVC makes all the intrinsics visible. It's hard to hold that against them, as it means you can use them all without any particular compile-time flags, which is nice. Anyway seems fine to just rename this, e.g. `mm_mullo_epi32_shim`. Stephen White 2015/10/27 21:12:16 Done. Show quoted text On 2015/10/27 20:24:34, mtklein wrote: > Annoyingly MSVC makes all the intrinsics visible. It's hard to hold that > against them, as it means you can use them all without any particular > compile-time flags, which is nice. > > Anyway seems fine to just rename this, e.g. `mm_mullo_epi32_shim`. Done.
52 __m128i p02 = _mm_mul_epu32(a, b),	49 __m128i p02 = _mm_mul_epu32(a, b),

53 p13 = _mm_mul_epu32(_mm_srli_si128(a, 4),	50 p13 = _mm_mul_epu32(_mm_srli_si128(a, 4),

54 _mm_srli_si128(b, 4));	51 _mm_srli_si128(b, 4));

55 return _mm_unpacklo_epi32(_mm_shuffle_epi32(p02, _MM_SHUFFLE(0,0,2,0)),	52 return _mm_unpacklo_epi32(_mm_shuffle_epi32(p02, _MM_SHUFFLE(0,0,2,0)),

56 _mm_shuffle_epi32(p13, _MM_SHUFFLE(0,0,2,0)));	53 _mm_shuffle_epi32(p13, _MM_SHUFFLE(0,0,2,0)));

57 };	54 };

58 #endif	55 #endif

59 const int rightBorder = SkMin32(rightOffset + 1, width);	56 #define INIT_SCALE const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);

60 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;	57 #define INIT_HALF const __m128i half = _mm_set1_epi32(1 << 23);

61 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;	58 #define INIT_SUMS __m128i sum = _mm_setzero_si128();

62 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;	59 #define INCREMENT_SUMS(c) sum = _mm_add_epi32(sum, expand(c))

63 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;	60 #define DECREMENT_SUMS(c) sum = _mm_sub_epi32(sum, expand(c))

64 const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);	61 #define STORE_SUMS \

65 const __m128i half = _mm_set1_epi32(1 << 23);	62 auto result = _mm_mullo_epi32(sum, scale); \

66 for (int y = 0; y < height; ++y) {	63 result = _mm_add_epi32(result, half); \

67 __m128i sum = _mm_setzero_si128();	64 *dptr = repack(result);

68 const SkPMColor* p = src;	65 // TODO(mtklein): make SK_PREFETCH work with MSVC, use that directly. ?
	mtklein 2015/10/27 20:24:34 should be good to go now should be good to go now Stephen White 2015/10/27 21:12:16 Fixed. Show quoted text On 2015/10/27 20:24:34, mtklein wrote: > should be good to go now Fixed. mtklein 2015/10/27 21:15:12 Oh boo, not compiling on Windows. I guess this is Show quoted text On 2015/10/27 21:12:16, Stephen White wrote: > On 2015/10/27 20:24:34, mtklein wrote: > > should be good to go now > > Fixed. Oh boo, not compiling on Windows. I guess this is the only place we use SK_PREFETCH...
69 for (int i = 0; i < rightBorder; ++i) {	66 #define PREFETCH(p) _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0)

70 sum = _mm_add_epi32(sum, expand(*p));	67 #define DOUBLE_ROW_OPTIMIZATION

71 p += srcStrideX;

72 }

73

74 const SkPMColor* sptr = src;

75 SkColor* dptr = dst;

76 for (int x = 0; x < width; ++x) {

77 // TODO(mtklein): We are working in 8.24 here. Drop to 8.8 when the kernel is narrow?

78 // Multiply each component by scale (divide by kernel size) and add half to round.

79 auto result = _mm_mullo_epi32(sum, scale);

80 result = _mm_add_epi32(result, half);

81

82 // Now pack the top byte of each 32-bit lane back down into one 32-b it color.

83 // Axxx Rxxx Gxxx Bxxx -> xxxx xxxx xxxx ARGB

84 *dptr = repack(result);

85

86 // TODO(mtklein): experiment with breaking this loop into 3 parts

87 if (x >= leftOffset) {

88 SkColor l = (sptr - leftOffset srcStrideX);

89 sum = _mm_sub_epi32(sum, expand(l));

90 }

91 if (x + rightOffset + 1 < width) {

92 SkColor r = (sptr + (rightOffset + 1) srcStrideX);

93 sum = _mm_add_epi32(sum, expand(r));

94 }

95 sptr += srcStrideX;

96 if (srcDirection == BlurDirection::kY) {

97 // TODO(mtklein): experiment with moving this prefetch forward

98 _mm_prefetch(reinterpret_cast<const char>(sptr + (rightOffset + 1) srcStrideX),

99 _MM_HINT_T0);

100 }

101 dptr += dstStrideX;

102 }

103 src += srcStrideY;

104 dst += dstStrideY;

105 }

106 }

107	68

108 #elif defined(SK_ARM_HAS_NEON)	69 #elif defined(SK_ARM_HAS_NEON)

109	70

110 // Fast path for kernel sizes between 2 and 127, working on two rows at a time.	71 // Fast path for kernel sizes between 2 and 127, working on two rows at a time.

111 template<BlurDirection srcDirection, BlurDirection dstDirection>	72 template<BlurDirection srcDirection, BlurDirection dstDirection>

112 void box_blur_double(const SkPMColor src, int srcStride, SkPMColor dst, int kernelSize,	73 void box_blur_double(const SkPMColor src, int srcStride, SkPMColor dst, int kernelSize,

113 int leftOffset, int rightOffset, int width, int* height) {	74 int leftOffset, int rightOffset, int width, int* height) {

114 // Load 2 pixels from adjacent rows.	75 // Load 2 pixels from adjacent rows.

115 auto load_2_pixels = [&](const SkPMColor* s) {	76 auto load_2_pixels = [&](const SkPMColor* s) {

116 if (srcDirection == BlurDirection::kX) {	77 if (srcDirection == BlurDirection::kX) {

(...skipping 43 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
160 sum = vaddw_u8(sum, load_2_pixels(sptr + (rightOffset + 1) * src StrideX));	121 sum = vaddw_u8(sum, load_2_pixels(sptr + (rightOffset + 1) * src StrideX));

161 }	122 }

162 sptr += srcStrideX;	123 sptr += srcStrideX;

163 dptr += dstStrideX;	124 dptr += dstStrideX;

164 }	125 }

165 src += srcStrideY 2;	126 src += srcStrideY 2;

166 dst += dstStrideY 2;	127 dst += dstStrideY 2;

167 }	128 }

168 }	129 }

169	130

170 template<BlurDirection srcDirection, BlurDirection dstDirection>	131 // ARGB -> 0A0R 0G0B

171 void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSiz e,	132 static inline uint16x4_t expand(SkPMColor p) {

172 int leftOffset, int rightOffset, int width, int height) {	133 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p))));

173 // ARGB -> 0A0R 0G0B	134 };

174 auto expand = [](uint32_t p) {

175 return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(p))));

176 };

177 const int rightBorder = SkMin32(rightOffset + 1, width);

178 const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;

179 const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;

180 const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;

181 const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;

182 const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);

183 const uint32x4_t half = vdupq_n_u32(1 << 23);

184	135

185 if (1 < kernelSize && kernelSize < 128) {	136 #define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);

186 box_blur_double<srcDirection, dstDirection>(&src, srcStride, &dst, kerne lSize,	137 #define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23);

187 leftOffset, rightOffset, wid th, &height);	138 #define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0);

	139 #define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c));

	140 #define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c));

	141

	142 #define STORE_SUMS \

	143 uint32x4_t result = vmlaq_u32(half, sum, scale); \

	144 uint16x4_t result16 = vqshrn_n_u32(result, 16); \

	145 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \

	146 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);

	147

	148 #define PREFETCH(p) SK_PREFETCH(p);

	149 #define DOUBLE_ROW_OPTIMIZATION \

	150 if (1 < kernelSize && kernelSize < 128) { \

	151 box_blur_double<srcDirection, dstDirection>(&src, srcStride, &dst, kerne lSize, \

	152 leftOffset, rightOffset, wid th, &height); \

188 }	153 }

189	154

190 for (; height > 0; height--) {

191 uint32x4_t sum = vdupq_n_u32(0);

192 const SkPMColor* p = src;

193 for (int i = 0; i < rightBorder; ++i) {

194 sum = vaddw_u16(sum, expand(*p));

195 p += srcStrideX;

196 }

197

198 const SkPMColor* sptr = src;

199 SkPMColor* dptr = dst;

200 for (int x = 0; x < width; ++x) {

201 // ( half+sumAscale half+sumRscale half+sumGscale half+sumBscale )

202 uint32x4_t result = vmlaq_u32(half, sum, scale);

203

204 // Saturated conversion to 16-bit.

205 // ( AAAA RRRR GGGG BBBB ) -> ( 0A 0R 0G 0B )

206 uint16x4_t result16 = vqshrn_n_u32(result, 16);

207

208 // Saturated conversion to 8-bit.

209 // ( 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( A R G B A R G B )

210 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8 );

211

212 // ( A R G B A R G B ) -> ( ARGB ARGB ) -> ( ARGB )

213 // Store low 32 bits to destination.

214 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);

215

216 if (x >= leftOffset) {

217 const SkPMColor* l = sptr - leftOffset * srcStrideX;

218 sum = vsubw_u16(sum, expand(*l));

219 }

220 if (x + rightOffset + 1 < width) {

221 const SkPMColor* r = sptr + (rightOffset + 1) * srcStrideX;

222 sum = vaddw_u16(sum, expand(*r));

223 }

224 sptr += srcStrideX;

225 if (srcDirection == BlurDirection::kX) {

226 SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX);

227 }

228 dptr += dstStrideX;

229 }

230 src += srcStrideY;

231 dst += dstStrideY;

232 }

233 }

234

235 #else // Neither NEON nor >=SSE2.	155 #else // Neither NEON nor >=SSE2.

236	156

	157 #define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize;

	158 #define INIT_HALF uint32_t half = 1 << 23;

	159 #define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0;

	160 #define INCREMENT_SUMS(c) \

	161 sumA += SkGetPackedA32(c); \

	162 sumR += SkGetPackedR32(c); \

	163 sumG += SkGetPackedG32(c); \

	164 sumB += SkGetPackedB32(c)

	165 #define DECREMENT_SUMS(c) \

	166 sumA -= SkGetPackedA32(c); \

	167 sumR -= SkGetPackedR32(c); \

	168 sumG -= SkGetPackedG32(c); \

	169 sumB -= SkGetPackedB32(c)

	170 #define STORE_SUMS \

	171 dptr = SkPackARGB32((sumA scale + half) >> 24, \

	172 (sumR * scale + half) >> 24, \

	173 (sumG * scale + half) >> 24, \

	174 (sumB * scale + half) >> 24);

	175 #define PREFETCH SK_PREFETCH

	176 #define DOUBLE_ROW_OPTIMIZATION

	177

	178 #endif

	179

237 template<BlurDirection srcDirection, BlurDirection dstDirection>	180 template<BlurDirection srcDirection, BlurDirection dstDirection>

238 static void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int ke rnelSize,	181 static void box_blur(const SkPMColor* src, int srcStride, SkPMColor* dst, int ke rnelSize,

239 int leftOffset, int rightOffset, int width, int height) {	182 int leftOffset, int rightOffset, int width, int height) {

240 int rightBorder = SkMin32(rightOffset + 1, width);	183 int rightBorder = SkMin32(rightOffset + 1, width);

241 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;	184 int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;

242 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;	185 int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;

243 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;	186 int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;

244 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;	187 int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;

245 uint32_t scale = (1 << 24) / kernelSize;	188 INIT_SCALE

246 uint32_t half = 1 << 23;	189 INIT_HALF

	190

	191 DOUBLE_ROW_OPTIMIZATION

	192

247 for (int y = 0; y < height; ++y) {	193 for (int y = 0; y < height; ++y) {

248 int sumA = 0, sumR = 0, sumG = 0, sumB = 0;	194 INIT_SUMS

249 const SkPMColor* p = src;	195 const SkPMColor* p = src;

250 for (int i = 0; i < rightBorder; ++i) {	196 for (int i = 0; i < rightBorder; ++i) {

251 sumA += SkGetPackedA32(*p);	197 INCREMENT_SUMS(*p);

252 sumR += SkGetPackedR32(*p);

253 sumG += SkGetPackedG32(*p);

254 sumB += SkGetPackedB32(*p);

255 p += srcStrideX;	198 p += srcStrideX;

256 }	199 }

257	200

258 const SkPMColor* sptr = src;	201 const SkPMColor* sptr = src;

259 SkColor* dptr = dst;	202 SkColor* dptr = dst;

260 for (int x = 0; x < width; ++x) {	203 for (int x = 0; x < width; ++x) {

261 dptr = SkPackARGB32((sumA scale + half) >> 24,	204 STORE_SUMS

262 (sumR * scale + half) >> 24,

263 (sumG * scale + half) >> 24,

264 (sumB * scale + half) >> 24);

265 if (x >= leftOffset) {	205 if (x >= leftOffset) {

266 SkColor l = (sptr - leftOffset srcStrideX);	206 SkColor l = (sptr - leftOffset srcStrideX);

267 sumA -= SkGetPackedA32(l);	207 DECREMENT_SUMS(l);

268 sumR -= SkGetPackedR32(l);

269 sumG -= SkGetPackedG32(l);

270 sumB -= SkGetPackedB32(l);

271 }	208 }

272 if (x + rightOffset + 1 < width) {	209 if (x + rightOffset + 1 < width) {

273 SkColor r = (sptr + (rightOffset + 1) srcStrideX);	210 SkColor r = (sptr + (rightOffset + 1) srcStrideX);

274 sumA += SkGetPackedA32(r);	211 INCREMENT_SUMS(r);

275 sumR += SkGetPackedR32(r);

276 sumG += SkGetPackedG32(r);

277 sumB += SkGetPackedB32(r);

278 }	212 }

279 sptr += srcStrideX;	213 sptr += srcStrideX;

280 if (srcDirection == BlurDirection::kY) {	214 if (srcDirection == BlurDirection::kY) {

281 SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX);	215 PREFETCH(sptr + (rightOffset + 1) * srcStrideX);

282 }	216 }

283 dptr += dstStrideX;	217 dptr += dstStrideX;

284 }	218 }

285 src += srcStrideY;	219 src += srcStrideY;

286 dst += dstStrideY;	220 dst += dstStrideY;

287 }	221 }

288 }	222 }

289	223

290 #endif

291

292 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>,	224 static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>,

293 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>,	225 box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>,

294 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>;	226 box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>;

295	227

296 } // namespace SK_OPTS_NS	228 } // namespace SK_OPTS_NS

297	229

298 #endif	230 #endif

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »