Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 /* | 8 /* |
| 9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q | 9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |
| 10 */ | 10 */ |
| (...skipping 109 matching lines...) | |
| 120 | 120 |
| 121 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | 121 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| 122 | 122 |
| 123 void srcover_srgb_srgb( | 123 void srcover_srgb_srgb( |
| 124 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { | 124 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
| 125 const __m128i alphaMask = _mm_set1_epi32(0xFF000000); | 125 const __m128i alphaMask = _mm_set1_epi32(0xFF000000); |
| 126 while (ndst > 0) { | 126 while (ndst > 0) { |
| 127 int count = SkTMin(ndst, nsrc); | 127 int count = SkTMin(ndst, nsrc); |
| 128 ndst -= count; | 128 ndst -= count; |
| 129 const uint32_t* src = srcStart; | 129 const uint32_t* src = srcStart; |
| 130 const uint32_t* end = src + (count & ~3); | 130 const uint32_t* end = dst + (count & ~3); |
| | 131 ptrdiff_t delta = src - dst; |
| 131 | 132 |
| 132 while (src < end) { | 133 while (dst < end) { |
| 133 __m128i pixels = load(src); | 134 __m128i pixels = load(src); |
| 134 if (_mm_testc_si128(pixels, alphaMask)) { | 135 if (_mm_testc_si128(pixels, alphaMask)) { |
| | 136 uint32_t* start = dst; |
| 135 do { | 137 do { |
| 136 store(dst, pixels); | 138 store(dst, pixels); |
| 137 dst += 4; | 139 dst += 4; |
| 138 src += 4; | 140 } while (dst < end |
| 139 } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); | 141 && _mm_testc_si128(pixels = load(dst + delta), alphaMask)); |
| | 142 src += dst - start; |
| 140 } else if (_mm_testz_si128(pixels, alphaMask)) { | 143 } else if (_mm_testz_si128(pixels, alphaMask)) { |
| 141 do { | 144 do { |
| 142 dst += 4; | 145 dst += 4; |
| 143 src += 4; | 146 src += 4; |
| 144 } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); | 147 } while (dst < end |
| | 148 && _mm_testz_si128(pixels = load(src), alphaMask)); |
| 145 } else { | 149 } else { |
| | 150 uint32_t* start = dst; |
| 146 do { | 151 do { |
| 147 srcover_srgb_srgb_4(dst, src); | 152 srcover_srgb_srgb_4(dst, dst + delta); |
| 148 dst += 4; | 153 dst += 4; |
| 149 src += 4; | 154 } while (dst < end |
| 150 } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); | 155 && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask)); |
| | 156 src += dst - start; |
| 151 } | 157 } |
| 152 } | 158 } |
| 153 | 159 |
| 154 count = count & 3; | 160 count = count & 3; |
| 155 while (count-- > 0) { | 161 while (count-- > 0) { |
| 156 srcover_srgb_srgb_1(dst++, *src++); | 162 srcover_srgb_srgb_1(dst++, *src++); |
| 157 } | 163 } |
| 158 } | 164 } |
| 159 } | 165 } |
| 160 #else | 166 #else |
| 161 // SSE2 versions | 167 // SSE2 versions |
| | 168 |
| | 169 // Note: In the next three comparisons a group of 4 pixels is converted to a group of |
| | 170 // "signed" pixels because SSE2 does not have an unsigned comparison. |
| | 171 // Make it so that we can use the signed comparison operators by biasing |
| | 172 // 0x00xxxxxx to 0x80xxxxxx which are the smallest values and biasing 0xffxxxxxx to |
| | 173 // 0x7fxxxxxx which is the largest set of values. |
| 162 static inline bool check_opaque_alphas(__m128i pixels) { | 174 static inline bool check_opaque_alphas(__m128i pixels) { |
| | 175 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); |
| 163 int mask = | 176 int mask = |
| 164 _mm_movemask_epi8( | 177 _mm_movemask_epi8( |
| 165 _mm_cmpeq_epi32( | 178 _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000))); |
| 166 _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), | 179 return mask == 0; |
| 167 _mm_setzero_si128())); | |
| 168 return mask == 0xFFFF; | |
| 169 } | 180 } |
| 170 | 181 |
| 171 static inline bool check_transparent_alphas(__m128i pixels) { | 182 static inline bool check_transparent_alphas(__m128i pixels) { |
| | 183 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); |
| 172 int mask = | 184 int mask = |
| 173 _mm_movemask_epi8( | 185 _mm_movemask_epi8( |
| 174 _mm_cmpeq_epi32( | 186 _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF))); |
| | mtklein 2016/05/24 12:15:37: Can't we trim the xor here by exploiting the const |
| 175 _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), | 187 return mask == 0; |
| 176 _mm_setzero_si128())); | |
| 177 return mask == 0xFFFF; | |
| 178 } | 188 } |
| 179 | 189 |
| 180 static inline bool check_partial_alphas(__m128i pixels) { | 190 static inline bool check_partial_alphas(__m128i pixels) { |
| 181 __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); | 191 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); |
| 182 int mask = | 192 __m128i opaque = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)); |
| | mtklein 2016/05/24 13:00:13: I think we can make this logic clearer. To start, |
| 183 _mm_movemask_epi8( | 193 __m128i transparent = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)); |
| 184 _mm_cmpeq_epi8( | 194 int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent)); |
| 185 _mm_srai_epi32(alphas, 8), | 195 return mask == 0; |
| 186 alphas)); | |
| 187 return mask == 0xFFFF; | |
| 188 } | 196 } |
| 189 | 197 |
| 190 void srcover_srgb_srgb( | 198 void srcover_srgb_srgb( |
| 191 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { | 199 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
| 192 while (ndst > 0) { | 200 while (ndst > 0) { |
| 193 int count = SkTMin(ndst, nsrc); | 201 int count = SkTMin(ndst, nsrc); |
| 194 ndst -= count; | 202 ndst -= count; |
| 195 const uint32_t* src = srcStart; | 203 const uint32_t* src = srcStart; |
| 196 const uint32_t* end = src + (count & ~3); | 204 const uint32_t* end = dst + (count & ~3); |
| | 205 const ptrdiff_t delta = src - dst; |
| 197 | 206 |
| 198 __m128i pixels = load(src); | 207 __m128i pixels = load(src); |
| 199 do { | 208 do { |
| 200 if (check_opaque_alphas(pixels)) { | 209 if (check_opaque_alphas(pixels)) { |
| | 210 uint32_t* start = dst; |
| 201 do { | 211 do { |
| 202 store(dst, pixels); | 212 store(dst, pixels); |
| 203 dst += 4; | 213 dst += 4; |
| 204 src += 4; | 214 } while (dst < end && check_opaque_alphas((pixels = load(dst + delta)))); |
| 205 } while (src < end && check_opaque_alphas(pixels = load(src))); | 215 src += dst - start; |
| 206 } else if (check_transparent_alphas(pixels)) { | 216 } else if (check_transparent_alphas(pixels)) { |
| 207 const uint32_t* start = src; | 217 const uint32_t* start = dst; |
| 208 do { | 218 do { |
| 209 src += 4; | 219 dst += 4; |
| 210 } while (src < end && check_transparent_alphas(pixels = load(src))); | 220 } while (dst < end && check_transparent_alphas(pixels = load(dst + delta))); |
| 211 dst += src - start; | 221 src += dst - start; |
| 212 } else { | 222 } else { |
| | 223 const uint32_t* start = dst; |
| 213 do { | 224 do { |
| 214 srcover_srgb_srgb_4(dst, src); | 225 srcover_srgb_srgb_4(dst, dst + delta); |
| 215 dst += 4; | 226 dst += 4; |
| 216 src += 4; | 227 } while (dst < end && check_partial_alphas(pixels = load(dst + delta))); |
| 217 } while (src < end && check_partial_alphas(pixels = load(src))); | 228 src += dst - start; |
| 218 } | 229 } |
| 219 } while (src < end); | 230 } while (dst < end); |
| 220 | 231 |
| 221 count = count & 3; | 232 count = count & 3; |
| 222 while (count-- > 0) { | 233 while (count-- > 0) { |
| 223 srcover_srgb_srgb_1(dst++, *src++); | 234 srcover_srgb_srgb_1(dst++, *src++); |
| 224 } | 235 } |
| 225 } | 236 } |
| 226 } | 237 } |
| 227 #endif | 238 #endif |
| 228 #else | 239 #else |
| 229 | 240 |
| 230 void srcover_srgb_srgb( | 241 void srcover_srgb_srgb( |
| 231 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | 242 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| 232 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); | 243 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); |
| 233 } | 244 } |
| 234 | 245 |
| 235 #endif | 246 #endif |
| 236 | 247 |
| 237 } // namespace SK_OPTS_NS | 248 } // namespace SK_OPTS_NS |
| 238 | 249 |
| 239 #endif//SkBlend_opts_DEFINED | 250 #endif//SkBlend_opts_DEFINED |
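
The structural change on the NEW side of both loops is the same: instead of advancing `src` and `dst` in lockstep, the code records `delta = src - dst` once per span, bounds the inner loops with a `dst`-based `end` pointer, loads source pixels as `dst + delta`, and advances `src` afterwards with `src += dst - start`. Below is a minimal scalar sketch of that pointer pattern; `copy_span` is a hypothetical stand-in for the per-4-pixel work, not code from this CL.

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the pointer-delta pattern: delta is constant for the whole span,
// so the loop only advances dst and recovers the matching source element as
// dst + delta; afterwards src is advanced by however far dst moved.
static void copy_span(uint32_t* dst, const uint32_t* src, int count) {
    const ptrdiff_t delta = src - dst;   // recorded once per span
    const uint32_t* end = dst + count;   // loop bound expressed in terms of dst
    uint32_t* start = dst;
    while (dst < end) {
        *dst = *(dst + delta);           // the pixel *src would have supplied
        dst += 1;
    }
    src += dst - start;                  // dst moved exactly as far as src would have
    (void)src;                           // keep the sketch warning-free
}
```

The CL applies the same arithmetic per group of four pixels inside the alpha-classified loops above.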
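The note added above the new SSE2 helpers explains why the pixels are XOR-ed with 0x80000000: SSE2 has only signed 32-bit compares, and flipping the sign bit maps unsigned order onto signed order, so `_mm_cmplt_epi32` and `_mm_cmpgt_epi32` against biased thresholds can still answer questions about the unsigned pixel value. A scalar sketch of that bias follows; the helper names are hypothetical, and it assumes the alpha byte occupies the top eight bits of the pixel, as the 0xFF000000 mask does.

```cpp
#include <cassert>
#include <cstdint>

// Sign-bit bias: u1 < u2 as unsigned 32-bit values exactly when
// (u1 ^ 0x80000000) < (u2 ^ 0x80000000) as signed 32-bit values.
static inline bool alpha_is_opaque(uint32_t pixel) {
    int32_t biased = (int32_t)(pixel ^ 0x80000000u);
    // unsigned pixel >= 0xFF000000 (alpha == 0xFF)  <=>  biased >= 0x7F000000
    return biased >= (int32_t)0x7F000000;
}

static inline bool alpha_is_transparent(uint32_t pixel) {
    int32_t biased = (int32_t)(pixel ^ 0x80000000u);
    // unsigned pixel <= 0x00FFFFFF (alpha == 0)  <=>  biased <= (int32_t)0x80FFFFFF
    return biased <= (int32_t)0x80FFFFFF;
}

int main() {
    assert( alpha_is_opaque(0xFF123456));
    assert(!alpha_is_opaque(0x7E123456));
    assert( alpha_is_transparent(0x00ABCDEF));
    assert(!alpha_is_transparent(0x01ABCDEF));
    return 0;
}
```

The vector helpers in the new code perform the same two comparisons on four pixels at once and use `_mm_movemask_epi8` to collapse the per-lane results into one mask that is tested against zero.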