OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkBitmapProcState_opts_SSE2.h" | 9 #include "SkBitmapProcState_opts_SSE2.h" |
10 #include "SkBlitRow_opts_SSE2.h" | 10 #include "SkBlitRow_opts_SSE2.h" |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
67 } | 67 } |
68 | 68 |
// S32A_Opaque_BlitRow32_SSE2, shown as a side-by-side OLD | NEW review diff.
//
// Composites `count` source pixels onto `dst` in place; every scalar path
// applies SkPMSrcOver(src, dst). `alpha` must be 255 (asserted) and a
// non-positive `count` returns immediately.
//
// OLD column: when count >= 4, single pixels are blended until dst is 16-byte
// aligned, then 4 pixels are blended per SSE2 iteration
// (SK_USE_ACCURATE_BLENDING selects between the 128/255-constant loop and the
// 256-minus-alpha loop); remaining pixels are blended one at a time.
//
// NEW column: SK_USE_ACCURATE_BLENDING now wraps the whole body. When it is
// defined the aligned 4-pixel path above is kept. Otherwise 16 pixels are
// handled per iteration with unaligned loads/stores: the group is skipped when
// every source alpha byte is zero, copied straight to dst when every source
// alpha byte is 0xFF, and otherwise blended with SkPMSrcOver_SSE2 (presumably
// the 4-pixel vector counterpart of SkPMSrcOver -- defined outside this view).
// The last <= 15 pixels are blended one at a time.
69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, | 69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
70 const SkPMColor* SK_RESTRICT src, | 70 const SkPMColor* SK_RESTRICT src, |
71 int count, U8CPU alpha) { | 71 int count, U8CPU alpha) { |
72 SkASSERT(alpha == 255); | 72 SkASSERT(alpha == 255); |
73 if (count <= 0) { | 73 if (count <= 0) { |
74 return; | 74 return; |
75 } | 75 } |
76 | 76 |
| 77 #ifdef SK_USE_ACCURATE_BLENDING |
77 if (count >= 4) { | 78 if (count >= 4) {  |
78 SkASSERT(((size_t)dst & 0x03) == 0); | 79 SkASSERT(((size_t)dst & 0x03) == 0); |
// Blend single pixels until dst sits on a 16-byte boundary, so the vector
// loop below can use aligned loads/stores on dst (src is still loaded unaligned).
79 while (((size_t)dst & 0x0F) != 0) { | 80 while (((size_t)dst & 0x0F) != 0) { |
80 *dst = SkPMSrcOver(*src, *dst); | 81 *dst = SkPMSrcOver(*src, *dst); |
81 src++; | 82 src++; |
82 dst++; | 83 dst++; |
83 count--; | 84 count--; |
84 } | 85 } |
85 | 86 |
86 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 87 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
87 __m128i *d = reinterpret_cast<__m128i*>(dst); | 88 __m128i *d = reinterpret_cast<__m128i*>(dst); |
88 #ifdef SK_USE_ACCURATE_BLENDING | |
89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | 89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) | 90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) |
91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) | 91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) |
92 while (count >= 4) { | 92 while (count >= 4) { |
93 // Load 4 pixels | 93 // Load 4 pixels |
94 __m128i src_pixel = _mm_loadu_si128(s); | 94 __m128i src_pixel = _mm_loadu_si128(s); |
95 __m128i dst_pixel = _mm_load_si128(d); | 95 __m128i dst_pixel = _mm_load_si128(d); |
96 | 96 |
97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | 97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | 98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
(...skipping 28 matching lines...) Expand all Loading... |
127 // Combine back into RGBA. | 127 // Combine back into RGBA. |
128 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | 128 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
129 | 129 |
130 // Add result | 130 // Add result |
131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | 131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
132 _mm_store_si128(d, result); | 132 _mm_store_si128(d, result); |
133 s++; | 133 s++; |
134 d++; | 134 d++; |
135 count -= 4; | 135 count -= 4; |
136 } | 136 } |
137 #else | |
138 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
139 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) | |
140 while (count >= 4) { | |
141 // Load 4 pixels | |
142 __m128i src_pixel = _mm_loadu_si128(s); | |
143 __m128i dst_pixel = _mm_load_si128(d); | |
144 | |
145 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
146 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | |
147 | |
148 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) | |
149 __m128i alpha = _mm_srli_epi16(src_pixel, 8); | |
150 | |
151 // (a0, a0, a1, a1, a2, g2, a3, g3) | |
152 alpha = _mm_shufflehi_epi16(alpha, 0xF5); | |
153 | |
154 // (a0, a0, a1, a1, a2, a2, a3, a3) | |
155 alpha = _mm_shufflelo_epi16(alpha, 0xF5); | |
156 | |
157 // Subtract alphas from 256, to get 1..256 | |
158 alpha = _mm_sub_epi16(c_256, alpha); | |
159 | |
160 // Multiply red and blue by src alpha. | |
161 dst_rb = _mm_mullo_epi16(dst_rb, alpha); | |
162 // Multiply alpha and green by src alpha. | |
163 dst_ag = _mm_mullo_epi16(dst_ag, alpha); | |
164 | |
165 // Divide by 256. | |
166 dst_rb = _mm_srli_epi16(dst_rb, 8); | |
167 | |
168 // Mask out high bits (already in the right place) | |
169 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); | |
170 | |
171 // Combine back into RGBA. | |
172 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
173 | |
174 // Add result | |
175 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | |
176 _mm_store_si128(d, result); | |
177 s++; | |
178 d++; | |
179 count -= 4; | |
180 } | |
181 #endif | |
182 src = reinterpret_cast<const SkPMColor*>(s); | 137 src = reinterpret_cast<const SkPMColor*>(s); |
183 dst = reinterpret_cast<SkPMColor*>(d); | 138 dst = reinterpret_cast<SkPMColor*>(d); |
184 } | 139 } |
185 | 140 |
186 while (count > 0) { | 141 while (count > 0) { |
187 *dst = SkPMSrcOver(*src, *dst); | 142 *dst = SkPMSrcOver(*src, *dst); |
188 src++; | 143 src++; |
189 dst++; | 144 dst++; |
190 count--; | 145 count--; |
191 } | 146 } |
| 147 #else |
| 148 int count16 = count / 16; |
| 149 __m128i* dst4 = (__m128i*)dst; |
| 150 const __m128i* src4 = (const __m128i*)src; |
| 151 |
| 152 for (int i = 0; i < count16 * 4; i += 4) { |
| 153 // Load 16 source pixels. |
| 154 __m128i s0 = _mm_loadu_si128(src4+i+0), |
| 155 s1 = _mm_loadu_si128(src4+i+1), |
| 156 s2 = _mm_loadu_si128(src4+i+2), |
| 157 s3 = _mm_loadu_si128(src4+i+3); |
| 158 |
// NOTE(review): alphaMask does not depend on i; it could be built once before the loop.
// After masking, only the alpha byte of each pixel can be non-zero. OR-ing the
// four vectors first means the compare against zero sets all 16 bytes (movemask
// == 0xffff) only when every one of the 16 source alphas is zero.
| 159 const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT); |
| 160 const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1,
s0))); |
| 161 __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero
_si128()); |
| 162 if (0xffff == _mm_movemask_epi8(cmp)) { |
| 163 // All 16 source pixels are fully transparent. There's nothing to do
! |
| 164 continue; |
| 165 } |
// Same idea with AND: the masked result equals alphaMask in every byte only
// when all 16 source alphas are 0xFF.
| 166 const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(
s1, s0))); |
| 167 cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask); |
| 168 if (0xffff == _mm_movemask_epi8(cmp)) { |
| 169 // All 16 source pixels are fully opaque. There's no need to read ds
t or blend it. |
| 170 _mm_storeu_si128(dst4+i+0, s0); |
| 171 _mm_storeu_si128(dst4+i+1, s1); |
| 172 _mm_storeu_si128(dst4+i+2, s2); |
| 173 _mm_storeu_si128(dst4+i+3, s3); |
| 174 continue; |
| 175 } |
| 176 // The general slow case: do the blend for all 16 pixels. |
| 177 _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0
))); |
| 178 _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1
))); |
| 179 _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2
))); |
| 180 _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3
))); |
| 181 } |
| 182 |
| 183 // Wrap up the last <= 15 pixels. |
| 184 SkASSERT(count - (count16*16) <= 15); |
| 185 for (int i = count16*16; i < count; i++) { |
// NOTE(review): this test hard-codes alpha in the top byte (0xFF000000), while
// the vector path above derives its mask from SK_A32_SHIFT -- confirm
// SK_A32_SHIFT == 24 on every build that compiles this file.
| 186 // This check is not really necessary, but it prevents pointless autov
ectorization. |
| 187 if (src[i] & 0xFF000000) { |
| 188 dst[i] = SkPMSrcOver(src[i], dst[i]); |
| 189 } |
| 190 } |
| 191 #endif |
192 } | 192 } |
193 | 193 |
194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, | 194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
195 const SkPMColor* SK_RESTRICT src, | 195 const SkPMColor* SK_RESTRICT src, |
196 int count, U8CPU alpha) { | 196 int count, U8CPU alpha) { |
197 SkASSERT(alpha <= 255); | 197 SkASSERT(alpha <= 255); |
198 if (count <= 0) { | 198 if (count <= 0) { |
199 return; | 199 return; |
200 } | 200 } |
201 | 201 |
(...skipping 944 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
1148 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1148 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
1150 } | 1150 } |
1151 dst += 1; | 1151 dst += 1; |
1152 DITHER_INC_X(x); | 1152 DITHER_INC_X(x); |
1153 } while (--count != 0); | 1153 } while (--count != 0); |
1154 } | 1154 } |
1155 } | 1155 } |
OLD | NEW |