OLD | NEW |
---|---|
1 #include "SkColorPriv.h" | 1 #include "SkColorPriv.h" |
2 #include "SkColor_opts_SSE2.h" | 2 #include "SkColor_opts_SSE2.h" |
3 #include "SkMathPriv.h" | 3 #include "SkMathPriv.h" |
4 #include "SkXfermode.h" | 4 #include "SkXfermode.h" |
5 #include "SkXfermode_opts_SSE2.h" | 5 #include "SkXfermode_opts_SSE2.h" |
6 #include "SkXfermode_proccoeff.h" | 6 #include "SkXfermode_proccoeff.h" |
7 | 7 |
8 //////////////////////////////////////////////////////////////////////////////// | 8 //////////////////////////////////////////////////////////////////////////////// |
9 // 4 pixels SSE2 version functions | 9 // 4 pixels SSE2 version functions |
10 //////////////////////////////////////////////////////////////////////////////// | 10 //////////////////////////////////////////////////////////////////////////////// |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
77 __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da); | 77 __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da); |
78 | 78 |
79 | 79 |
80 __m128i sb = SkGetPackedB32_SSE2(src); | 80 __m128i sb = SkGetPackedB32_SSE2(src); |
81 __m128i db = SkGetPackedB32_SSE2(dst); | 81 __m128i db = SkGetPackedB32_SSE2(dst); |
82 __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da); | 82 __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da); |
83 | 83 |
84 return SkPackARGB32_SSE2(a, r, g, b); | 84 return SkPackARGB32_SSE2(a, r, g, b); |
85 } | 85 } |
86 | 86 |
// Lane-wise 32-bit multiply: for each of the four 32-bit lanes, returns the
// low 32 bits of a[i] * b[i]. _mm_mul_epu32 only multiplies lanes 0 and 2
// (producing 64-bit results), so the odd lanes are shifted down and multiplied
// in a second step, then the low halves are interleaved back into lane order.
static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {
    // 64-bit products of lanes 0 and 2.
    const __m128i evenProducts = _mm_mul_epu32(a, b);

    // Byte-shift by 4 so lanes 1 and 3 land in positions 0 and 2.
    const __m128i aOdd = _mm_srli_si128(a, 4);
    const __m128i bOdd = _mm_srli_si128(b, 4);
    const __m128i oddProducts = _mm_mul_epu32(aOdd, bOdd);

    // Collect the low 32 bits of each 64-bit product into lanes 0 and 1.
    const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0));

    // Interleave: even0, odd0, even1, odd1 == lanes 0, 1, 2, 3.
    return _mm_unpacklo_epi32(evenLow, oddLow);
}
94 | |
// Portable version of SkSqrtBits is in SkMath.cpp.
//
// Bit-by-bit square root evaluated on four 32-bit lanes at once. Each pass
// shifts the next two high bits of the input into a running remainder and
// produces one more bit of the root. The loop body runs (count + 1) times.
//
// NOTE(review): the review thread on this function (mtklein / qiankun) asked
// whether it should be spun off into an SSE2 math header; the reply says a new
// file SkMath_opts_SSE2.h was created to hold it.
static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) {
    const __m128i kOne = _mm_set1_epi32(1);

    __m128i result = _mm_setzero_si128();
    __m128i highRem = _mm_setzero_si128();
    __m128i lowRem = x;

    do {
        result = _mm_slli_epi32(result, 1);

        // Move the top two bits of lowRem into the bottom of highRem.
        const __m128i carry = _mm_srli_epi32(lowRem, 30);
        highRem = _mm_or_si128(_mm_slli_epi32(highRem, 2), carry);
        lowRem = _mm_slli_epi32(lowRem, 2);

        // trial = 2 * result + 1
        const __m128i trial = _mm_add_epi32(_mm_slli_epi32(result, 1), kOne);

        // Lanes where highRem < trial (signed compare) are left untouched.
        // Every other lane subtracts the trial value from the remainder and
        // sets the newly shifted-in low bit of the root.
        const __m128i tooSmall = _mm_cmplt_epi32(highRem, trial);
        highRem = _mm_sub_epi32(highRem, _mm_andnot_si128(tooSmall, trial));
        result = _mm_add_epi32(result, _mm_andnot_si128(tooSmall, kOne));
    } while (--count >= 0);

    return result;
}
124 | |
125 static __m128i sqrt_unit_byte_SSE2(const __m128i& n) { | |
126 return SkSqrtBits_SSE2(n, 15+4); | |
127 } | |
128 | |
129 static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc, | |
130 const __m128i& sa, const __m128i& da) { | |
131 __m128i tmp1, tmp2, tmp3; | |
132 | |
133 // int m = da ? dc * 256 / da : 0; | |
134 __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128()); | |
135 __m128i m = _mm_slli_epi32(dc, 8); | |
136 __m128 x = _mm_cvtepi32_ps(m); | |
137 __m128 y = _mm_cvtepi32_ps(da); | |
138 m = _mm_cvttps_epi32(_mm_div_ps(x, y)); | |
139 m = _mm_andnot_si128(cmp, m); | |
140 | |
141 // if (2 * sc <= sa) | |
142 tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc | |
143 __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); | |
144 tmp1 = _mm_sub_epi32(tmp1, sa); // 2*sc - sa | |
145 tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m | |
146 tmp1 = Multiply32_SSE2(tmp1, tmp2); | |
147 tmp1 = _mm_srai_epi32(tmp1, 8); | |
148 tmp1 = _mm_add_epi32(sa, tmp1); | |
149 tmp1 = Multiply32_SSE2(dc, tmp1); | |
150 __m128i rc1 = _mm_andnot_si128(cmp1, tmp1); | |
151 | |
152 // else if (4 * dc <= da) | |
153 tmp2 = _mm_slli_epi32(dc, 2); // dc * 4 | |
154 __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da); | |
155 __m128i i = _mm_slli_epi32(m, 2); // 4 * m | |
156 __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256 | |
157 __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256) | |
158 __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256 | |
159 i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256) | |
160 i = _mm_srai_epi32(i, 16); // >> 16 | |
161 j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m | |
162 tmp2 = _mm_add_epi32(i, j); | |
163 i = Multiply32_SSE2(dc, sa); // dc * sa | |
164 j = _mm_slli_epi32(sc, 1); // 2 * sc | |
165 j = _mm_sub_epi32(j, sa); // 2 * sc - sa | |
166 j = Multiply32_SSE2(da, j); // da * (2 * sc - sa) | |
167 tmp2 = Multiply32_SSE2(j, tmp2); // * tmp | |
168 tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8 | |
169 tmp2 = _mm_add_epi32(i, tmp2); | |
170 cmp = _mm_andnot_si128(cmp2, cmp1); | |
171 __m128i rc2 = _mm_and_si128(cmp, tmp2); | |
172 __m128i rc = _mm_or_si128(rc1, rc2); | |
173 | |
174 // else | |
175 tmp3 = sqrt_unit_byte_SSE2(m); | |
176 tmp3 = _mm_sub_epi32(tmp3, m); | |
177 tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa) | |
178 tmp3 = _mm_srai_epi32(tmp3, 8); | |
179 tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa | |
180 cmp = _mm_and_si128(cmp1, cmp2); | |
181 __m128i rc3 = _mm_and_si128(cmp, tmp3); | |
182 rc = _mm_or_si128(rc, rc3); | |
183 | |
184 tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da | |
185 tmp1 = _mm_mullo_epi16(sc, tmp1); | |
186 tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa | |
187 tmp2 = _mm_mullo_epi16(dc, tmp2); | |
188 rc = _mm_add_epi32(rc, tmp1); | |
189 rc = _mm_add_epi32(rc, tmp2); | |
190 return clamp_div255round_SSE2(rc); | |
191 } | |
192 | |
193 static __m128i softlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) { | |
194 __m128i sa = SkGetPackedA32_SSE2(src); | |
195 __m128i da = SkGetPackedA32_SSE2(dst); | |
196 | |
197 __m128i a = srcover_byte_SSE2(sa, da); | |
198 __m128i r = softlight_byte_SSE2(SkGetPackedR32_SSE2(src), | |
199 SkGetPackedR32_SSE2(dst), sa, da); | |
200 __m128i g = softlight_byte_SSE2(SkGetPackedG32_SSE2(src), | |
201 SkGetPackedG32_SSE2(dst), sa, da); | |
202 __m128i b = softlight_byte_SSE2(SkGetPackedB32_SSE2(src), | |
203 SkGetPackedB32_SSE2(dst), sa, da); | |
204 return SkPackARGB32_SSE2(a, r, g, b); | |
205 } | |
206 /////////////////////////////////////////////////////////////////////////////// | |
88 | 207 |
89 typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst); | 208 typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst); |
90 | 209 |
91 extern SkXfermodeProcSIMD gSSE2XfermodeProcs[]; | 210 extern SkXfermodeProcSIMD gSSE2XfermodeProcs[]; |
92 | 211 |
93 SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer) | 212 SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer) |
94 : INHERITED(buffer) { | 213 : INHERITED(buffer) { |
95 fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]); | 214 fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]); |
96 } | 215 } |
97 | 216 |
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
238 NULL, // kPlus_Mode | 357 NULL, // kPlus_Mode |
239 NULL, // kModulate_Mode | 358 NULL, // kModulate_Mode |
240 NULL, // kScreen_Mode | 359 NULL, // kScreen_Mode |
241 | 360 |
242 NULL, // kOverlay_Mode | 361 NULL, // kOverlay_Mode |
243 NULL, // kDarken_Mode | 362 NULL, // kDarken_Mode |
244 NULL, // kLighten_Mode | 363 NULL, // kLighten_Mode |
245 NULL, // kColorDodge_Mode | 364 NULL, // kColorDodge_Mode |
246 NULL, // kColorBurn_Mode | 365 NULL, // kColorBurn_Mode |
247 NULL, // kHardLight_Mode | 366 NULL, // kHardLight_Mode |
248 NULL, // kSoftLight_Mode | 367 softlight_modeproc_SSE2, |
249 NULL, // kDifference_Mode | 368 NULL, // kDifference_Mode |
250 NULL, // kExclusion_Mode | 369 NULL, // kExclusion_Mode |
251 multiply_modeproc_SSE2, | 370 multiply_modeproc_SSE2, |
252 | 371 |
253 NULL, // kHue_Mode | 372 NULL, // kHue_Mode |
254 NULL, // kSaturation_Mode | 373 NULL, // kSaturation_Mode |
255 NULL, // kColor_Mode | 374 NULL, // kColor_Mode |
256 NULL, // kLuminosity_Mode | 375 NULL, // kLuminosity_Mode |
257 }; | 376 }; |
258 | 377 |
259 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, | 378 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, |
260 SkXfermode::Mode mode) { | 379 SkXfermode::Mode mode) { |
261 void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]); | 380 void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]); |
262 | 381 |
263 if (procSIMD != NULL) { | 382 if (procSIMD != NULL) { |
264 return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD)); | 383 return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD)); |
265 } | 384 } |
266 return NULL; | 385 return NULL; |
267 } | 386 } |
OLD | NEW |