src/core/Sk4px.h - Issue 1245673002: 565 support for SIMD xfermodes

Side by Side Diff: src/core/Sk4px.h

Issue 1245673002: 565 support for SIMD xfermodes (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: fix typo Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2015 Google Inc.	2 * Copyright 2015 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef Sk4px_DEFINED	8 #ifndef Sk4px_DEFINED

9 #define Sk4px_DEFINED	9 #define Sk4px_DEFINED

10	10

11 #include "SkNx.h"	11 #include "SkNx.h"

12 #include "SkColor.h"	12 #include "SkColor.h"

	13 #include "SkColorPriv.h"

13	14

14 // This file may be included multiple times by .cpp files with different flags, leading	15 // This file may be included multiple times by .cpp files with different flags, leading

15 // to different definitions. Usually that doesn't matter because it's all inlined, but	16 // to different definitions. Usually that doesn't matter because it's all inlined, but

16 // in Debug modes the compilers may not inline everything. So wrap everything in an	17 // in Debug modes the compilers may not inline everything. So wrap everything in an

17 // anonymous namespace to give each includer their own silo of this code (or the linker	18 // anonymous namespace to give each includer their own silo of this code (or the linker

18 // will probably pick one randomly for us, which is rarely correct).	19 // will probably pick one randomly for us, which is rarely correct).

19 namespace {	20 namespace {

20	21

21 // 1, 2 or 4 SkPMColors, generally vectorized.	22 // 1, 2 or 4 SkPMColors, generally vectorized.

22 class Sk4px : public Sk16b {	23 class Sk4px : public Sk16b {

(...skipping 17 matching lines...) Expand all Loading...
40 static Sk4px Load1(const SkPMColor[1]); // PMColor[1] -> ARGB ???? ???? ????	41 static Sk4px Load1(const SkPMColor[1]); // PMColor[1] -> ARGB ???? ???? ????

41	42

42 // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.	43 // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.

43 static Sk4px Load4Alphas(const SkAlpha[4]); // AaXx -> AAAA aaaa XXXX xxxx	44 static Sk4px Load4Alphas(const SkAlpha[4]); // AaXx -> AAAA aaaa XXXX xxxx

44 static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa ???? ????	45 static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa ???? ????

45	46

46 void store4(SkPMColor[4]) const;	47 void store4(SkPMColor[4]) const;

47 void store2(SkPMColor[2]) const;	48 void store2(SkPMColor[2]) const;

48 void store1(SkPMColor[1]) const;	49 void store1(SkPMColor[1]) const;

49	50

	51 // Same as above for 565.

	52 static Sk4px Load4(const SkPMColor16 src[4]);

	53 static Sk4px Load2(const SkPMColor16 src[2]);

	54 static Sk4px Load1(const SkPMColor16 src[1]);

	55 void store4(SkPMColor16 dst[4]) const;

	56 void store2(SkPMColor16 dst[2]) const;

	57 void store1(SkPMColor16 dst[1]) const;

	58

50 // 1, 2, or 4 SkPMColors with 16-bit components.	59 // 1, 2, or 4 SkPMColors with 16-bit components.

51 // This is most useful as the result of a multiply, e.g. from mulWiden().	60 // This is most useful as the result of a multiply, e.g. from mulWiden().

52 class Wide : public Sk16h {	61 class Wide : public Sk16h {

53 public:	62 public:

54 Wide(const Sk16h& v) : Sk16h(v) {}	63 Wide(const Sk16h& v) : Sk16h(v) {}

55	64

56 // Pack the top byte of each component back down into 4 SkPMColors.	65 // Pack the top byte of each component back down into 4 SkPMColors.

57 Sk4px addNarrowHi(const Sk16h&) const;	66 Sk4px addNarrowHi(const Sk16h&) const;

58	67

59 // Rounds, i.e. (x+127) / 255.	68 // Rounds, i.e. (x+127) / 255.

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
92	101

93 // Generally faster than (*this * o).div255().	102 // Generally faster than (*this * o).div255().

94 // May be incorrect by +-1, but is always exactly correct when *this or o is 0 or 255.	103 // May be incorrect by +-1, but is always exactly correct when *this or o is 0 or 255.

95 Sk4px approxMulDiv255(const Sk16b& o) const {	104 Sk4px approxMulDiv255(const Sk16b& o) const {

96 // (x*y + x) / 256 meets these criteria. (As of course does (x*y + y) / 256 by symmetry.)	105 // (x*y + x) / 256 meets these criteria. (As of course does (x*y + y) / 256 by symmetry.)

97 return this->widenLo().addNarrowHi(*this * o);	106 return this->widenLo().addNarrowHi(*this * o);

98 }	107 }

99	108

100 // A generic driver that maps fn over a src array into a dst array.	109 // A generic driver that maps fn over a src array into a dst array.

101 // fn should take an Sk4px (4 src pixels) and return an Sk4px (4 dst pixels).	110 // fn should take an Sk4px (4 src pixels) and return an Sk4px (4 dst pixels).

102 template <typename Fn>	111 template <typename Fn, typename Dst>

103 static void MapSrc(int n, SkPMColor* dst, const SkPMColor* src, const Fn& fn) {	112 static void MapSrc(int n, Dst* dst, const SkPMColor* src, const Fn& fn) {

104 // This looks a bit odd, but it helps loop-invariant hoisting across different calls to fn.	113 // This looks a bit odd, but it helps loop-invariant hoisting across different calls to fn.

105 // Basically, we need to make sure we keep things inside a single loop.	114 // Basically, we need to make sure we keep things inside a single loop.

106 while (n > 0) {	115 while (n > 0) {

107 if (n >= 8) {	116 if (n >= 8) {

108 Sk4px dst0 = fn(Load4(src+0)),	117 Sk4px dst0 = fn(Load4(src+0)),

109 dst4 = fn(Load4(src+4));	118 dst4 = fn(Load4(src+4));

110 dst0.store4(dst+0);	119 dst0.store4(dst+0);

111 dst4.store4(dst+4);	120 dst4.store4(dst+4);

112 dst += 8; src += 8; n -= 8;	121 dst += 8; src += 8; n -= 8;

113 continue; // Keep our stride at 8 pixels as long as possible.	122 continue; // Keep our stride at 8 pixels as long as possible.

114 }	123 }

115 SkASSERT(n <= 7);	124 SkASSERT(n <= 7);

116 if (n >= 4) {	125 if (n >= 4) {

117 fn(Load4(src)).store4(dst);	126 fn(Load4(src)).store4(dst);

118 dst += 4; src += 4; n -= 4;	127 dst += 4; src += 4; n -= 4;

119 }	128 }

120 if (n >= 2) {	129 if (n >= 2) {

121 fn(Load2(src)).store2(dst);	130 fn(Load2(src)).store2(dst);

122 dst += 2; src += 2; n -= 2;	131 dst += 2; src += 2; n -= 2;

123 }	132 }

124 if (n >= 1) {	133 if (n >= 1) {

125 fn(Load1(src)).store1(dst);	134 fn(Load1(src)).store1(dst);

126 }	135 }

127 break;	136 break;

128 }	137 }

129 }	138 }

130	139

131 // As above, but with dst4' = fn(dst4, src4).	140 // As above, but with dst4' = fn(dst4, src4).

132 template <typename Fn>	141 template <typename Fn, typename Dst>

133 static void MapDstSrc(int n, SkPMColor* dst, const SkPMColor* src, const Fn& fn) {	142 static void MapDstSrc(int n, Dst* dst, const SkPMColor* src, const Fn& fn) {

134 while (n > 0) {	143 while (n > 0) {

135 if (n >= 8) {	144 if (n >= 8) {

136 Sk4px dst0 = fn(Load4(dst+0), Load4(src+0)),	145 Sk4px dst0 = fn(Load4(dst+0), Load4(src+0)),

137 dst4 = fn(Load4(dst+4), Load4(src+4));	146 dst4 = fn(Load4(dst+4), Load4(src+4));

138 dst0.store4(dst+0);	147 dst0.store4(dst+0);

139 dst4.store4(dst+4);	148 dst4.store4(dst+4);

140 dst += 8; src += 8; n -= 8;	149 dst += 8; src += 8; n -= 8;

141 continue; // Keep our stride at 8 pixels as long as possible.	150 continue; // Keep our stride at 8 pixels as long as possible.

142 }	151 }

143 SkASSERT(n <= 7);	152 SkASSERT(n <= 7);

144 if (n >= 4) {	153 if (n >= 4) {

145 fn(Load4(dst), Load4(src)).store4(dst);	154 fn(Load4(dst), Load4(src)).store4(dst);

146 dst += 4; src += 4; n -= 4;	155 dst += 4; src += 4; n -= 4;

147 }	156 }

148 if (n >= 2) {	157 if (n >= 2) {

149 fn(Load2(dst), Load2(src)).store2(dst);	158 fn(Load2(dst), Load2(src)).store2(dst);

150 dst += 2; src += 2; n -= 2;	159 dst += 2; src += 2; n -= 2;

151 }	160 }

152 if (n >= 1) {	161 if (n >= 1) {

153 fn(Load1(dst), Load1(src)).store1(dst);	162 fn(Load1(dst), Load1(src)).store1(dst);

154 }	163 }

155 break;	164 break;

156 }	165 }

157 }	166 }

158	167

159 // As above, but with dst4' = fn(dst4, src4, alpha4).	168 // As above, but with dst4' = fn(dst4, src4, alpha4).

160 template <typename Fn>	169 template <typename Fn, typename Dst>

161 static void MapDstSrcAlpha(int n, SkPMColor* dst, const SkPMColor* src, const SkAlpha* a,	170 static void MapDstSrcAlpha(int n, Dst* dst, const SkPMColor* src, const SkAlpha* a,

162 const Fn& fn) {	171 const Fn& fn) {

163 while (n > 0) {	172 while (n > 0) {

164 if (n >= 8) {	173 if (n >= 8) {

165 Sk4px dst0 = fn(Load4(dst+0), Load4(src+0), Load4Alphas(a+0)),	174 Sk4px dst0 = fn(Load4(dst+0), Load4(src+0), Load4Alphas(a+0)),

166 dst4 = fn(Load4(dst+4), Load4(src+4), Load4Alphas(a+4));	175 dst4 = fn(Load4(dst+4), Load4(src+4), Load4Alphas(a+4));

167 dst0.store4(dst+0);	176 dst0.store4(dst+0);

168 dst4.store4(dst+4);	177 dst4.store4(dst+4);

169 dst += 8; src += 8; a += 8; n -= 8;	178 dst += 8; src += 8; a += 8; n -= 8;

170 continue; // Keep our stride at 8 pixels as long as possible.	179 continue; // Keep our stride at 8 pixels as long as possible.

171 }	180 }

(...skipping 25 matching lines...) Expand all Loading...
197 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	206 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

198 #include "../opts/Sk4px_SSE2.h"	207 #include "../opts/Sk4px_SSE2.h"

199 #elif defined(SK_ARM_HAS_NEON)	208 #elif defined(SK_ARM_HAS_NEON)

200 #include "../opts/Sk4px_NEON.h"	209 #include "../opts/Sk4px_NEON.h"

201 #else	210 #else

202 #include "../opts/Sk4px_none.h"	211 #include "../opts/Sk4px_none.h"

203 #endif	212 #endif

204 #endif	213 #endif

205	214

206 #endif//Sk4px_DEFINED	215 #endif//Sk4px_DEFINED

OLD	NEW

« no previous file with comments | « no previous file | src/core/Sk4pxXfermode.h » ('j') | no next file with comments »