| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The Android Open Source Project | 2 * Copyright 2014 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkColor_opts_SSE2_DEFINED | 8 #ifndef SkColor_opts_SSE2_DEFINED |
| 9 #define SkColor_opts_SSE2_DEFINED | 9 #define SkColor_opts_SSE2_DEFINED |
| 10 | 10 |
| (...skipping 62 matching lines...) |
| 73 __m128i rb = _mm_and_si128(mask, c); | 73 __m128i rb = _mm_and_si128(mask, c); |
| 74 rb = _mm_mulhi_epu16(rb, s); | 74 rb = _mm_mulhi_epu16(rb, s); |
| 75 | 75 |
| 76 __m128i ag = _mm_andnot_si128(mask, c); | 76 __m128i ag = _mm_andnot_si128(mask, c); |
| 77 ag = _mm_mulhi_epu16(ag, s); // Alpha and green values are in the higher byte of each word. | 77 ag = _mm_mulhi_epu16(ag, s); // Alpha and green values are in the higher byte of each word. |
| 78 ag = _mm_andnot_si128(mask, ag); | 78 ag = _mm_andnot_si128(mask, ag); |
| 79 | 79 |
| 80 return _mm_or_si128(rb, ag); | 80 return _mm_or_si128(rb, ag); |
| 81 } | 81 } |
| 82 | 82 |
| 83 // Portable version SkFastFourByteInterp256 is in SkColorPriv.h. |
| 84 static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) { |
| 85 // Computes dst + (((src - dst)*src_scale)>>8) |
| 86 const __m128i mask = _mm_set1_epi32(0x00FF00FF); |
| 87 |
| 88 // Unpack the 16x8-bit source into 2 8x16-bit splayed halves. |
| 89 __m128i src_rb = _mm_and_si128(mask, src); |
| 90 __m128i src_ag = _mm_srli_epi16(src, 8); |
| 91 __m128i dst_rb = _mm_and_si128(mask, dst); |
| 92 __m128i dst_ag = _mm_srli_epi16(dst, 8); |
| 93 |
| 94 // Compute scaled differences. |
| 95 __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb); |
| 96 __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag); |
| 97 __m128i s = _mm_set1_epi16(src_scale); |
| 98 diff_rb = _mm_mullo_epi16(diff_rb, s); |
| 99 diff_ag = _mm_mullo_epi16(diff_ag, s); |
| 100 |
| 101 // Pack the differences back together. |
| 102 diff_rb = _mm_srli_epi16(diff_rb, 8); |
| 103 diff_ag = _mm_andnot_si128(mask, diff_ag); |
| 104 __m128i diff = _mm_or_si128(diff_rb, diff_ag); |
| 105 |
| 106 // Add difference to destination. |
| 107 return _mm_add_epi8(dst, diff); |
| 108 } |
| 109 |
| 110 // Portable version SkPMLerp is in SkColorPriv.h |
| 111 static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) { |
| 112 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
| 113 return _mm_add_epi8(SkAlphaMulQ_SSE2(src, scale), SkAlphaMulQ_SSE2(dst, 256 - scale)); |
| 114 #else |
| 115 return SkFastFourByteInterp256_SSE2(src, dst, scale); |
| 116 #endif |
| 117 } |
| 118 |
| 83 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) { | 119 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) { |
| 84 #if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte. | 120 #if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte. |
| 85 return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then, | 121 return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then, |
| 86 #else // but I've seen Clang just do a dumb left shift of zero. :( | 122 #else // but I've seen Clang just do a dumb left shift of zero. :( |
| 87 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT)); | 123 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT)); |
| 88 return _mm_srli_epi32(a, 24); | 124 return _mm_srli_epi32(a, 24); |
| 89 #endif | 125 #endif |
| 90 } | 126 } |
| 91 | 127 |
| 92 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) { | 128 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) { |
| (...skipping 113 matching lines...) |
| 206 return d_pixel; | 242 return d_pixel; |
| 207 } | 243 } |
| 208 | 244 |
| 209 // Portable version is SkPMSrcOver in SkColorPriv.h. | 245 // Portable version is SkPMSrcOver in SkColorPriv.h. |
| 210 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) { | 246 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) { |
| 211 return _mm_add_epi32(src, | 247 return _mm_add_epi32(src, |
| 212 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256), | 248 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256), |
| 213 SkGetPackedA32_SSE2(src)))); | 249 SkGetPackedA32_SSE2(src)))); |
| 214 } | 250 } |
| 215 | 251 |
| 216 // Portable version is SkBlendARGB32 in SkColorPriv.h. | |
| 217 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst, | |
| 218 const __m128i& aa) { | |
| 219 __m128i src_scale = SkAlpha255To256_SSE2(aa); | |
| 220 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)) | |
| 221 __m128i dst_scale = SkGetPackedA32_SSE2(src); | |
| 222 dst_scale = _mm_mullo_epi16(dst_scale, src_scale); | |
| 223 dst_scale = _mm_srli_epi16(dst_scale, 8); | |
| 224 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale); | |
| 225 | |
| 226 __m128i result = SkAlphaMulQ_SSE2(src, src_scale); | |
| 227 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale)); | |
| 228 } | |
| 229 | |
| 230 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor. | 252 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor. |
| 231 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst, | 253 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst, |
| 232 const unsigned aa) { | 254 const unsigned aa) { |
| 233 unsigned alpha = SkAlpha255To256(aa); | 255 unsigned alpha = SkAlpha255To256(aa); |
| 234 __m128i src_scale = _mm_set1_epi32(alpha); | 256 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP |
| 235 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)) | 257 __m128i src_scale = _mm_set1_epi32(alpha); |
| 258 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)) |
| 259 __m128i dst_scale = SkGetPackedA32_SSE2(src); |
| 260 dst_scale = _mm_mullo_epi16(dst_scale, src_scale); |
| 261 dst_scale = _mm_srli_epi16(dst_scale, 8); |
| 262 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale); |
| 263 |
| 264 __m128i result = SkAlphaMulQ_SSE2(src, alpha); |
| 265 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale)); |
| 266 #else |
| 267 __m128i src_scale = _mm_set1_epi16(alpha); |
| 268 // SkAlphaMulInv256(SkGetPackedA32(src), src_scale) |
| 236 __m128i dst_scale = SkGetPackedA32_SSE2(src); | 269 __m128i dst_scale = SkGetPackedA32_SSE2(src); |
| 270 // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale. |
| 237 dst_scale = _mm_mullo_epi16(dst_scale, src_scale); | 271 dst_scale = _mm_mullo_epi16(dst_scale, src_scale); |
| 238 dst_scale = _mm_srli_epi16(dst_scale, 8); | 272 dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale); |
| 239 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale); | 273 dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8)); |
| 274 dst_scale = _mm_srli_epi32(dst_scale, 8); |
| 275 // Duplicate scales into 2x16-bit pattern per pixel. |
| 276 dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0)); |
| 277 dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0)); |
| 240 | 278 |
| 241 __m128i result = SkAlphaMulQ_SSE2(src, alpha); | 279 const __m128i mask = _mm_set1_epi32(0x00FF00FF); |
| 242 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale)); | 280 |
| 281 // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves. |
| 282 __m128i src_rb = _mm_and_si128(mask, src); |
| 283 __m128i src_ag = _mm_srli_epi16(src, 8); |
| 284 __m128i dst_rb = _mm_and_si128(mask, dst); |
| 285 __m128i dst_ag = _mm_srli_epi16(dst, 8); |
| 286 |
| 287 // Scale them. |
| 288 src_rb = _mm_mullo_epi16(src_rb, src_scale); |
| 289 src_ag = _mm_mullo_epi16(src_ag, src_scale); |
| 290 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale); |
| 291 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale); |
| 292 |
| 293 // Add the scaled source and destination. |
| 294 dst_rb = _mm_add_epi16(src_rb, dst_rb); |
| 295 dst_ag = _mm_add_epi16(src_ag, dst_ag); |
| 296 |
| 297 // Unsplay the halves back together. |
| 298 dst_rb = _mm_srli_epi16(dst_rb, 8); |
| 299 dst_ag = _mm_andnot_si128(mask, dst_ag); |
| 300 return _mm_or_si128(dst_rb, dst_ag); |
| 301 #endif |
| 243 } | 302 } |
| 244 | 303 |
| 245 #undef ASSERT_EQ | 304 #undef ASSERT_EQ |
| 246 #endif // SkColor_opts_SSE2_DEFINED | 305 #endif // SkColor_opts_SSE2_DEFINED |
| OLD | NEW |