src/opts/SkBlurImage_opts_SSE4.cpp - Issue 1123263003: Really use SSE4 (and SSSE3) in SkBlurImage_SSE4

Side by Side Diff: src/opts/SkBlurImage_opts_SSE4.cpp

Issue 1123263003: Really use SSE4 (and SSSE3) in SkBlurImage_SSE4 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: 0 Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 The Android Open Source Project	2 * Copyright 2014 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBitmap.h"	8 #include "SkBitmap.h"

9 #include "SkBlurImage_opts_SSE4.h"	9 #include "SkBlurImage_opts_SSE4.h"

10 #include "SkColorPriv.h"	10 #include "SkColorPriv.h"

11 #include "SkRect.h"	11 #include "SkRect.h"

12	12

13 /* With the exception of the compilers that don't support it, we always build the	13 /* With the exception of the compilers that don't support it, we always build the

14 * SSE4 functions and enable the caller to determine SSE4 support. However for	14 * SSE4 functions and enable the caller to determine SSE4 support. However for

15 * compilers that do not support SSE4x we provide a stub implementation.	15 * compilers that do not support SSE4x we provide a stub implementation.

16 */	16 */

17 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41	17 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

18	18

19 #include <smmintrin.h>	19 #include <smmintrin.h>

20	20

21 namespace {	21 namespace {

22 enum BlurDirection {	22 enum BlurDirection {

23 kX, kY	23 kX, kY

24 };	24 };

25	25

26 /* Helper function to spread the components of a 32-bit integer into the	26 /* Helper function to spread the components of a 32-bit integer into the

27 * lower 8 bits of each 32-bit element of an SSE register.	27 * lower 8 bits of each 32-bit element of an SSE register.

28 */	28 */

29 inline __m128i expand(int a) {	29 inline __m128i expand(int a) {

30 const __m128i zero = _mm_setzero_si128();	30 // ARGB -> 0000 0000 0000 ARGB

31	31 __m128i widened = _mm_cvtsi32_si128(a);

32 // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B	32 // SSE4.1 has xxxx xxxx xxxx ARGB -> 000A 000R 000G 000B as a one-stop-shop instruction.

33 __m128i result = _mm_cvtsi32_si128(a);	33 // It can even work from memory, so a smart compiler probably merges in the _mm_cvtsi32_si128().

34	34 return _mm_cvtepu8_epi32(widened);

35 // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B

36 result = _mm_unpacklo_epi8(result, zero);

37

38 // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B

39 return _mm_unpacklo_epi16(result, zero);

40 }	35 }

41	36

42 template<BlurDirection srcDirection, BlurDirection dstDirection>	37 template<BlurDirection srcDirection, BlurDirection dstDirection>

43 void SkBoxBlur_SSE4(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,	38 void SkBoxBlur_SSE4(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,

44 int leftOffset, int rightOffset, int width, int height)	39 int leftOffset, int rightOffset, int width, int height)

45 {	40 {

46 const int rightBorder = SkMin32(rightOffset + 1, width);	41 const int rightBorder = SkMin32(rightOffset + 1, width);

47 const int srcStrideX = srcDirection == kX ? 1 : srcStride;	42 const int srcStrideX = srcDirection == kX ? 1 : srcStride;

48 const int dstStrideX = dstDirection == kX ? 1 : height;	43 const int dstStrideX = dstDirection == kX ? 1 : height;

49 const int srcStrideY = srcDirection == kX ? srcStride : 1;	44 const int srcStrideY = srcDirection == kX ? srcStride : 1;

50 const int dstStrideY = dstDirection == kX ? width : 1;	45 const int dstStrideY = dstDirection == kX ? width : 1;

51 const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);	46 const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);

52 const __m128i half = _mm_set1_epi32(1 << 23);	47 const __m128i half = _mm_set1_epi32(1 << 23);

53 const __m128i zero = _mm_setzero_si128();

54 for (int y = 0; y < height; ++y) {	48 for (int y = 0; y < height; ++y) {

55 __m128i sum = zero;	49 __m128i sum = _mm_setzero_si128();

56 const SkPMColor* p = src;	50 const SkPMColor* p = src;

57 for (int i = 0; i < rightBorder; ++i) {	51 for (int i = 0; i < rightBorder; ++i) {

58 sum = _mm_add_epi32(sum, expand(*p));	52 sum = _mm_add_epi32(sum, expand(*p));

59 p += srcStrideX;	53 p += srcStrideX;

60 }	54 }

61	55

62 const SkPMColor* sptr = src;	56 const SkPMColor* sptr = src;

63 SkColor* dptr = dst;	57 SkColor* dptr = dst;

64 for (int x = 0; x < width; ++x) {	58 for (int x = 0; x < width; ++x) {

	59 // TODO(mtklein): We are working in 8.24 here. Drop to 8.8 when the kernel is narrow?

	60

	61 // Multiply each component by scale (i.e. divide by kernel size) and add half to round.

65 __m128i result = _mm_mullo_epi32(sum, scale);	62 __m128i result = _mm_mullo_epi32(sum, scale);

66

67 // sumA*scale+.5 sumB*scale+.5 sumG*scale+.5 sumB*scale+.5

68 result = _mm_add_epi32(result, half);	63 result = _mm_add_epi32(result, half);

69	64

70 // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B	65 // Now pack the top byte of each 32-bit lane back down into one 32-bit color.

71 result = _mm_srli_epi32(result, 24);	66 // Axxx Rxxx Gxxx Bxxx -> xxxx xxxx xxxx ARGB

	67 const char _ = 0; // Don't care what ends up in these bytes. Happens to be byte 0.

	68 result = _mm_shuffle_epi8(result, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7,3));

72	69

73 // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B	70 *dptr = _mm_cvtsi128_si32(result);

74 result = _mm_packs_epi32(result, zero);

75	71

76 // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B	72 // TODO(mtklein): experiment with breaking this loop into 3 parts

77 result = _mm_packus_epi16(result, zero);

78 *dptr = _mm_cvtsi128_si32(result);

79 if (x >= leftOffset) {	73 if (x >= leftOffset) {

80 SkColor l = *(sptr - leftOffset * srcStrideX);	74 SkColor l = *(sptr - leftOffset * srcStrideX);

81 sum = _mm_sub_epi32(sum, expand(l));	75 sum = _mm_sub_epi32(sum, expand(l));

82 }	76 }

83 if (x + rightOffset + 1 < width) {	77 if (x + rightOffset + 1 < width) {

84 SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);	78 SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);

85 sum = _mm_add_epi32(sum, expand(r));	79 sum = _mm_add_epi32(sum, expand(r));

86 }	80 }

87 sptr += srcStrideX;	81 sptr += srcStrideX;

88 if (srcDirection == kY) {	82 if (srcDirection == kY) {

	83 // TODO(mtklein): experiment with moving this prefetch forward

89 _mm_prefetch(reinterpret_cast<const char*>(sptr + (rightOffset + 1) * srcStrideX),	84 _mm_prefetch(reinterpret_cast<const char*>(sptr + (rightOffset + 1) * srcStrideX),

90 _MM_HINT_T0);	85 _MM_HINT_T0);

91 }	86 }

92 dptr += dstStrideX;	87 dptr += dstStrideX;

93 }	88 }

94 src += srcStrideY;	89 src += srcStrideY;

95 dst += dstStrideY;	90 dst += dstStrideY;

96 }	91 }

97 }	92 }

98	93

(...skipping 15 matching lines...) Expand all Loading...
114 bool SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc* boxBlurX,	109 bool SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc* boxBlurX,

115 SkBoxBlurProc* boxBlurY,	110 SkBoxBlurProc* boxBlurY,

116 SkBoxBlurProc* boxBlurXY,	111 SkBoxBlurProc* boxBlurXY,

117 SkBoxBlurProc* boxBlurYX) {	112 SkBoxBlurProc* boxBlurYX) {

118 sk_throw();	113 sk_throw();

119 return false;	114 return false;

120 }	115 }

121	116

122	117

123 #endif	118 #endif

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »