| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
| 9 #include "SkBitmapProcState_opts_SSE2.h" | 9 #include "SkBitmapProcState_opts_SSE2.h" |
| 10 #include "SkBlitRow_opts_SSE2.h" | 10 #include "SkBlitRow_opts_SSE2.h" |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 67 } | 67 } |
| 68 | 68 |
| 69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, | 69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
| 70 const SkPMColor* SK_RESTRICT src, | 70 const SkPMColor* SK_RESTRICT src, |
| 71 int count, U8CPU alpha) { | 71 int count, U8CPU alpha) { |
| 72 SkASSERT(alpha == 255); | 72 SkASSERT(alpha == 255); |
| 73 if (count <= 0) { | 73 if (count <= 0) { |
| 74 return; | 74 return; |
| 75 } | 75 } |
| 76 | 76 |
| 77 #ifdef SK_USE_ACCURATE_BLENDING |
| 77 if (count >= 4) { | 78 if (count >= 4) { |
| 78 SkASSERT(((size_t)dst & 0x03) == 0); | 79 SkASSERT(((size_t)dst & 0x03) == 0); |
| 79 while (((size_t)dst & 0x0F) != 0) { | 80 while (((size_t)dst & 0x0F) != 0) { |
| 80 *dst = SkPMSrcOver(*src, *dst); | 81 *dst = SkPMSrcOver(*src, *dst); |
| 81 src++; | 82 src++; |
| 82 dst++; | 83 dst++; |
| 83 count--; | 84 count--; |
| 84 } | 85 } |
| 85 | 86 |
| 86 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 87 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 87 __m128i *d = reinterpret_cast<__m128i*>(dst); | 88 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 88 #ifdef SK_USE_ACCURATE_BLENDING | |
| 89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | 89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
| 90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) | 90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) |
| 91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) | 91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) |
| 92 while (count >= 4) { | 92 while (count >= 4) { |
| 93 // Load 4 pixels | 93 // Load 4 pixels |
| 94 __m128i src_pixel = _mm_loadu_si128(s); | 94 __m128i src_pixel = _mm_loadu_si128(s); |
| 95 __m128i dst_pixel = _mm_load_si128(d); | 95 __m128i dst_pixel = _mm_load_si128(d); |
| 96 | 96 |
| 97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | 97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
| 98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | 98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
| (...skipping 28 matching lines...) Expand all Loading... |
| 127 // Combine back into RGBA. | 127 // Combine back into RGBA. |
| 128 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | 128 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
| 129 | 129 |
| 130 // Add result | 130 // Add result |
| 131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | 131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 132 _mm_store_si128(d, result); | 132 _mm_store_si128(d, result); |
| 133 s++; | 133 s++; |
| 134 d++; | 134 d++; |
| 135 count -= 4; | 135 count -= 4; |
| 136 } | 136 } |
| 137 #else | |
| 138 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 139 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) | |
| 140 while (count >= 4) { | |
| 141 // Load 4 pixels | |
| 142 __m128i src_pixel = _mm_loadu_si128(s); | |
| 143 __m128i dst_pixel = _mm_load_si128(d); | |
| 144 | |
| 145 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
| 146 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | |
| 147 | |
| 148 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) | |
| 149 __m128i alpha = _mm_srli_epi16(src_pixel, 8); | |
| 150 | |
| 151 // (a0, a0, a1, a1, a2, g2, a3, g3) | |
| 152 alpha = _mm_shufflehi_epi16(alpha, 0xF5); | |
| 153 | |
| 154 // (a0, a0, a1, a1, a2, a2, a3, a3) | |
| 155 alpha = _mm_shufflelo_epi16(alpha, 0xF5); | |
| 156 | |
| 157 // Subtract alphas from 256, to get 1..256 | |
| 158 alpha = _mm_sub_epi16(c_256, alpha); | |
| 159 | |
| 160 // Multiply red and blue by (256 - src alpha). | |
| 161 dst_rb = _mm_mullo_epi16(dst_rb, alpha); | |
| 162 // Multiply alpha and green by (256 - src alpha). | |
| 163 dst_ag = _mm_mullo_epi16(dst_ag, alpha); | |
| 164 | |
| 165 // Divide by 256. | |
| 166 dst_rb = _mm_srli_epi16(dst_rb, 8); | |
| 167 | |
| 168 // Mask out high bits (already in the right place) | |
| 169 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); | |
| 170 | |
| 171 // Combine back into RGBA. | |
| 172 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
| 173 | |
| 174 // Add result | |
| 175 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | |
| 176 _mm_store_si128(d, result); | |
| 177 s++; | |
| 178 d++; | |
| 179 count -= 4; | |
| 180 } | |
| 181 #endif | |
| 182 src = reinterpret_cast<const SkPMColor*>(s); | 137 src = reinterpret_cast<const SkPMColor*>(s); |
| 183 dst = reinterpret_cast<SkPMColor*>(d); | 138 dst = reinterpret_cast<SkPMColor*>(d); |
| 184 } | 139 } |
| 185 | 140 |
| 186 while (count > 0) { | 141 while (count > 0) { |
| 187 *dst = SkPMSrcOver(*src, *dst); | 142 *dst = SkPMSrcOver(*src, *dst); |
| 188 src++; | 143 src++; |
| 189 dst++; | 144 dst++; |
| 190 count--; | 145 count--; |
| 191 } | 146 } |
| 147 #else |
| 148 int count16 = count / 16; |
| 149 __m128i* dst4 = (__m128i*)dst; |
| 150 const __m128i* src4 = (const __m128i*)src; |
| 151 |
| 152 for (int i = 0; i < count16 * 4; i += 4) { |
| 153 // Load 16 source pixels. |
| 154 __m128i s0 = _mm_loadu_si128(src4+i+0), |
| 155 s1 = _mm_loadu_si128(src4+i+1), |
| 156 s2 = _mm_loadu_si128(src4+i+2), |
| 157 s3 = _mm_loadu_si128(src4+i+3); |
| 158 |
| 159 const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT); |
| 160 const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1,
s0))); |
| 161 __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero
_si128()); |
| 162 if (0xffff == _mm_movemask_epi8(cmp)) { |
| 163 // All 16 source pixels are fully transparent. There's nothing to do
! |
| 164 continue; |
| 165 } |
| 166 const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(
s1, s0))); |
| 167 cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask); |
| 168 if (0xffff == _mm_movemask_epi8(cmp)) { |
| 169 // All 16 source pixels are fully opaque. There's no need to read ds
t or blend it. |
| 170 _mm_storeu_si128(dst4+i+0, s0); |
| 171 _mm_storeu_si128(dst4+i+1, s1); |
| 172 _mm_storeu_si128(dst4+i+2, s2); |
| 173 _mm_storeu_si128(dst4+i+3, s3); |
| 174 continue; |
| 175 } |
| 176 // The general slow case: do the blend for all 16 pixels. |
| 177 _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0
))); |
| 178 _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1
))); |
| 179 _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2
))); |
| 180 _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3
))); |
| 181 } |
| 182 |
| 183 // Wrap up the last <= 15 pixels. |
| 184 SkASSERT(count - (count16*16) <= 15); |
| 185 for (int i = count16*16; i < count; i++) { |
| 186 // This check is not really necessary, but it prevents pointless autov
ectorization. |
| 187 if (src[i] & 0xFF000000) { |
| 188 dst[i] = SkPMSrcOver(src[i], dst[i]); |
| 189 } |
| 190 } |
| 191 #endif |
| 192 } | 192 } |
| 193 | 193 |
| 194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, | 194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
| 195 const SkPMColor* SK_RESTRICT src, | 195 const SkPMColor* SK_RESTRICT src, |
| 196 int count, U8CPU alpha) { | 196 int count, U8CPU alpha) { |
| 197 SkASSERT(alpha <= 255); | 197 SkASSERT(alpha <= 255); |
| 198 if (count <= 0) { | 198 if (count <= 0) { |
| 199 return; | 199 return; |
| 200 } | 200 } |
| 201 | 201 |
| (...skipping 944 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
| 1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
| 1148 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1148 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
| 1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
| 1150 } | 1150 } |
| 1151 dst += 1; | 1151 dst += 1; |
| 1152 DITHER_INC_X(x); | 1152 DITHER_INC_X(x); |
| 1153 } while (--count != 0); | 1153 } while (--count != 0); |
| 1154 } | 1154 } |
| 1155 } | 1155 } |
| OLD | NEW |