Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(523)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 886403002: Optimize SSE2 opaque blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include <emmintrin.h> 8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h" 9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h" 10 #include "SkBlitRow_opts_SSE2.h"
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
67 } 67 }
68 68
69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
70 const SkPMColor* SK_RESTRICT src, 70 const SkPMColor* SK_RESTRICT src,
71 int count, U8CPU alpha) { 71 int count, U8CPU alpha) {
72 SkASSERT(alpha == 255); 72 SkASSERT(alpha == 255);
73 if (count <= 0) { 73 if (count <= 0) {
74 return; 74 return;
75 } 75 }
76 76
77 #ifdef SK_USE_ACCURATE_BLENDING
77 if (count >= 4) { 78 if (count >= 4) {
78 SkASSERT(((size_t)dst & 0x03) == 0); 79 SkASSERT(((size_t)dst & 0x03) == 0);
79 while (((size_t)dst & 0x0F) != 0) { 80 while (((size_t)dst & 0x0F) != 0) {
80 *dst = SkPMSrcOver(*src, *dst); 81 *dst = SkPMSrcOver(*src, *dst);
81 src++; 82 src++;
82 dst++; 83 dst++;
83 count--; 84 count--;
84 } 85 }
85 86
86 const __m128i *s = reinterpret_cast<const __m128i*>(src); 87 const __m128i *s = reinterpret_cast<const __m128i*>(src);
87 __m128i *d = reinterpret_cast<__m128i*>(dst); 88 __m128i *d = reinterpret_cast<__m128i*>(dst);
88 #ifdef SK_USE_ACCURATE_BLENDING
89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
92 while (count >= 4) { 92 while (count >= 4) {
93 // Load 4 pixels 93 // Load 4 pixels
94 __m128i src_pixel = _mm_loadu_si128(s); 94 __m128i src_pixel = _mm_loadu_si128(s);
95 __m128i dst_pixel = _mm_load_si128(d); 95 __m128i dst_pixel = _mm_load_si128(d);
96 96
97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
(...skipping 28 matching lines...) Expand all
127 // Combine back into RGBA. 127 // Combine back into RGBA.
128 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 128 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
129 129
130 // Add result 130 // Add result
131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
132 _mm_store_si128(d, result); 132 _mm_store_si128(d, result);
133 s++; 133 s++;
134 d++; 134 d++;
135 count -= 4; 135 count -= 4;
136 } 136 }
137 #else
138 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
139 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
140 while (count >= 4) {
141 // Load 4 pixels
142 __m128i src_pixel = _mm_loadu_si128(s);
143 __m128i dst_pixel = _mm_load_si128(d);
144
145 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
146 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
147
148 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
149 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
150
151 // (a0, a0, a1, a1, a2, g2, a3, g3)
152 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
153
154 // (a0, a0, a1, a1, a2, a2, a3, a3)
155 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
156
157 // Subtract alphas from 256, to get 1..256
158 alpha = _mm_sub_epi16(c_256, alpha);
159
160 // Multiply red and blue by src alpha.
161 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
162 // Multiply alpha and green by src alpha.
163 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
164
165 // Divide by 256.
166 dst_rb = _mm_srli_epi16(dst_rb, 8);
167
168 // Mask out high bits (already in the right place)
169 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
170
171 // Combine back into RGBA.
172 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
173
174 // Add result
175 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
176 _mm_store_si128(d, result);
177 s++;
178 d++;
179 count -= 4;
180 }
181 #endif
182 src = reinterpret_cast<const SkPMColor*>(s); 137 src = reinterpret_cast<const SkPMColor*>(s);
183 dst = reinterpret_cast<SkPMColor*>(d); 138 dst = reinterpret_cast<SkPMColor*>(d);
184 } 139 }
185 140
186 while (count > 0) { 141 while (count > 0) {
187 *dst = SkPMSrcOver(*src, *dst); 142 *dst = SkPMSrcOver(*src, *dst);
188 src++; 143 src++;
189 dst++; 144 dst++;
190 count--; 145 count--;
191 } 146 }
147 #else
148 int count16 = count / 16;
149 __m128i* dst4 = (__m128i*)dst;
150 const __m128i* src4 = (const __m128i*)src;
151
152 for (int i = 0; i < count16 * 4; i += 4) {
153 // Load 16 source pixels.
154 __m128i s0 = _mm_loadu_si128(src4+i+0),
155 s1 = _mm_loadu_si128(src4+i+1),
156 s2 = _mm_loadu_si128(src4+i+2),
157 s3 = _mm_loadu_si128(src4+i+3);
158
159 const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
160 const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
161 __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
162 if (0xffff == _mm_movemask_epi8(cmp)) {
163 // All 16 source pixels are fully transparent. There's nothing to do !
164 continue;
165 }
166 const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
167 cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
168 if (0xffff == _mm_movemask_epi8(cmp)) {
169 // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
170 _mm_storeu_si128(dst4+i+0, s0);
171 _mm_storeu_si128(dst4+i+1, s1);
172 _mm_storeu_si128(dst4+i+2, s2);
173 _mm_storeu_si128(dst4+i+3, s3);
174 continue;
175 }
176 // The general slow case: do the blend for all 16 pixels.
177 _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
178 _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
179 _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
180 _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
181 }
182
183 // Wrap up the last <= 15 pixels.
184 SkASSERT(count - (count16*16) <= 15);
185 for (int i = count16*16; i < count; i++) {
186 // This check is not really necessary, but it prevents pointless autovectorization.
187 if (src[i] & 0xFF000000) {
188 dst[i] = SkPMSrcOver(src[i], dst[i]);
189 }
190 }
191 #endif
192 } 192 }
193 193
194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
195 const SkPMColor* SK_RESTRICT src, 195 const SkPMColor* SK_RESTRICT src,
196 int count, U8CPU alpha) { 196 int count, U8CPU alpha) {
197 SkASSERT(alpha <= 255); 197 SkASSERT(alpha <= 255);
198 if (count <= 0) { 198 if (count <= 0) {
199 return; 199 return;
200 } 200 }
201 201
(...skipping 944 matching lines...) Expand 10 before | Expand all | Expand 10 after
1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1148 // now src and dst expanded are in g:11 r:10 x:1 b:10 1148 // now src and dst expanded are in g:11 r:10 x:1 b:10
1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1150 } 1150 }
1151 dst += 1; 1151 dst += 1;
1152 DITHER_INC_X(x); 1152 DITHER_INC_X(x);
1153 } while (--count != 0); 1153 } while (--count != 0);
1154 } 1154 }
1155 } 1155 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698