OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 /* | 8 /* |
9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q | 9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |
10 */ | 10 */ |
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
120 | 120 |
121 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | 121 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
122 | 122 |
123 void srcover_srgb_srgb( | 123 void srcover_srgb_srgb( |
124 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { | 124 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
125 const __m128i alphaMask = _mm_set1_epi32(0xFF000000); | 125 const __m128i alphaMask = _mm_set1_epi32(0xFF000000); |
126 while (ndst > 0) { | 126 while (ndst > 0) { |
127 int count = SkTMin(ndst, nsrc); | 127 int count = SkTMin(ndst, nsrc); |
128 ndst -= count; | 128 ndst -= count; |
129 const uint32_t* src = srcStart; | 129 const uint32_t* src = srcStart; |
130 const uint32_t* end = src + (count & ~3); | 130 const uint32_t* end = dst + (count & ~3); |
131 ptrdiff_t delta = src - dst; | |
131 | 132 |
132 while (src < end) { | 133 while (dst < end) { |
133 __m128i pixels = load(src); | 134 __m128i pixels = load(src); |
134 if (_mm_testc_si128(pixels, alphaMask)) { | 135 if (_mm_testc_si128(pixels, alphaMask)) { |
136 uint32_t* start = dst; | |
135 do { | 137 do { |
136 store(dst, pixels); | 138 store(dst, pixels); |
137 dst += 4; | 139 dst += 4; |
138 src += 4; | 140 } while (dst < end |
139 } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); | 141 && _mm_testc_si128(pixels = load(dst + delta), alphaMask)); |
142 src += dst - start; | |
140 } else if (_mm_testz_si128(pixels, alphaMask)) { | 143 } else if (_mm_testz_si128(pixels, alphaMask)) { |
141 do { | 144 do { |
142 dst += 4; | 145 dst += 4; |
143 src += 4; | 146 src += 4; |
144 } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); | 147 } while (dst < end |
148 && _mm_testz_si128(pixels = load(src), alphaMask)); | |
145 } else { | 149 } else { |
150 uint32_t* start = dst; | |
146 do { | 151 do { |
147 srcover_srgb_srgb_4(dst, src); | 152 srcover_srgb_srgb_4(dst, dst + delta); |
148 dst += 4; | 153 dst += 4; |
149 src += 4; | 154 } while (dst < end |
150 } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); | 155 && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask)); |
156 src += dst - start; | |
151 } | 157 } |
152 } | 158 } |
153 | 159 |
154 count = count & 3; | 160 count = count & 3; |
155 while (count-- > 0) { | 161 while (count-- > 0) { |
156 srcover_srgb_srgb_1(dst++, *src++); | 162 srcover_srgb_srgb_1(dst++, *src++); |
157 } | 163 } |
158 } | 164 } |
159 } | 165 } |
160 #else | 166 #else |
161 // SSE2 versions | 167 // SSE2 versions |
168 | |
169 // Note: In the next three comparisons a group of 4 pixels is converted to a group of | |
170 // "signed" pixels because the sse2 does not have an unsigned comparison. | |
171 // Make it so that we can use the signed comparison operators by biasing | |
172 // 0x00xxxxxx to 0x80xxxxxx which is the smallest value and biasing 0xffxxxxxx to | |
173 // 0x7fxxxxxx which is the largest set of values. | |
162 static inline bool check_opaque_alphas(__m128i pixels) { | 174 static inline bool check_opaque_alphas(__m128i pixels) { |
175 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); | |
163 int mask = | 176 int mask = |
164 _mm_movemask_epi8( | 177 _mm_movemask_epi8( |
165 _mm_cmpeq_epi32( | 178 _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000))); |
166 _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), | 179 return mask == 0; |
167 _mm_setzero_si128())); | |
168 return mask == 0xFFFF; | |
169 } | 180 } |
170 | 181 |
171 static inline bool check_transparent_alphas(__m128i pixels) { | 182 static inline bool check_transparent_alphas(__m128i pixels) { |
183 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); | |
172 int mask = | 184 int mask = |
173 _mm_movemask_epi8( | 185 _mm_movemask_epi8( |
174 _mm_cmpeq_epi32( | 186 _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF))); |
mtklein
2016/05/24 12:15:37
Can't we trim the xor here by exploiting the const
| |
175 _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), | 187 return mask == 0; |
176 _mm_setzero_si128())); | |
177 return mask == 0xFFFF; | |
178 } | 188 } |
179 | 189 |
180 static inline bool check_partial_alphas(__m128i pixels) { | 190 static inline bool check_partial_alphas(__m128i pixels) { |
181 __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); | 191 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); |
182 int mask = | 192 __m128i opaque = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)); |
mtklein
2016/05/24 13:00:13
I think we can make this logic clearer.
To start,
| |
183 _mm_movemask_epi8( | 193 __m128i transparent = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)); |
184 _mm_cmpeq_epi8( | 194 int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent)); |
185 _mm_srai_epi32(alphas, 8), | 195 return mask == 0; |
186 alphas)); | |
187 return mask == 0xFFFF; | |
188 } | 196 } |
189 | 197 |
190 void srcover_srgb_srgb( | 198 void srcover_srgb_srgb( |
191 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { | 199 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
192 while (ndst > 0) { | 200 while (ndst > 0) { |
193 int count = SkTMin(ndst, nsrc); | 201 int count = SkTMin(ndst, nsrc); |
194 ndst -= count; | 202 ndst -= count; |
195 const uint32_t* src = srcStart; | 203 const uint32_t* src = srcStart; |
196 const uint32_t* end = src + (count & ~3); | 204 const uint32_t* end = dst + (count & ~3); |
205 const ptrdiff_t delta = src - dst; | |
197 | 206 |
198 __m128i pixels = load(src); | 207 __m128i pixels = load(src); |
199 do { | 208 do { |
200 if (check_opaque_alphas(pixels)) { | 209 if (check_opaque_alphas(pixels)) { |
210 uint32_t* start = dst; | |
201 do { | 211 do { |
202 store(dst, pixels); | 212 store(dst, pixels); |
203 dst += 4; | 213 dst += 4; |
204 src += 4; | 214 } while (dst < end && check_opaque_alphas((pixels = load(dst + delta)))); |
205 } while (src < end && check_opaque_alphas(pixels = load(src))); | 215 src += dst - start; |
206 } else if (check_transparent_alphas(pixels)) { | 216 } else if (check_transparent_alphas(pixels)) { |
207 const uint32_t* start = src; | 217 const uint32_t* start = dst; |
208 do { | 218 do { |
209 src += 4; | 219 dst += 4; |
210 } while (src < end && check_transparent_alphas(pixels = load(src))); | 220 } while (dst < end && check_transparent_alphas(pixels = load(dst + delta))); |
211 dst += src - start; | 221 src += dst - start; |
212 } else { | 222 } else { |
223 const uint32_t* start = dst; | |
213 do { | 224 do { |
214 srcover_srgb_srgb_4(dst, src); | 225 srcover_srgb_srgb_4(dst, dst + delta); |
215 dst += 4; | 226 dst += 4; |
216 src += 4; | 227 } while (dst < end && check_partial_alphas(pixels = load(dst + delta))); |
217 } while (src < end && check_partial_alphas(pixels = load(src))); | 228 src += dst - start; |
218 } | 229 } |
219 } while (src < end); | 230 } while (dst < end); |
220 | 231 |
221 count = count & 3; | 232 count = count & 3; |
222 while (count-- > 0) { | 233 while (count-- > 0) { |
223 srcover_srgb_srgb_1(dst++, *src++); | 234 srcover_srgb_srgb_1(dst++, *src++); |
224 } | 235 } |
225 } | 236 } |
226 } | 237 } |
227 #endif | 238 #endif |
228 #else | 239 #else |
229 | 240 |
230 void srcover_srgb_srgb( | 241 void srcover_srgb_srgb( |
231 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | 242 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
232 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); | 243 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); |
233 } | 244 } |
234 | 245 |
235 #endif | 246 #endif |
236 | 247 |
237 } // namespace SK_OPTS_NS | 248 } // namespace SK_OPTS_NS |
238 | 249 |
239 #endif//SkBlend_opts_DEFINED | 250 #endif//SkBlend_opts_DEFINED |
OLD | NEW |