src/opts/SkBlitRow_opts_SSE2.cpp - Issue 886403002: Optimize SSE2 opaque blend

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 886403002: Optimize SSE2 opaque blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include <emmintrin.h>	8 #include <emmintrin.h>

9 #include "SkBitmapProcState_opts_SSE2.h"	9 #include "SkBitmapProcState_opts_SSE2.h"

10 #include "SkBlitRow_opts_SSE2.h"	10 #include "SkBlitRow_opts_SSE2.h"

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
67 }	67 }

68	68

69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,	69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

70 const SkPMColor* SK_RESTRICT src,	70 const SkPMColor* SK_RESTRICT src,

71 int count, U8CPU alpha) {	71 int count, U8CPU alpha) {

72 SkASSERT(alpha == 255);	72 SkASSERT(alpha == 255);

73 if (count <= 0) {	73 if (count <= 0) {

74 return;	74 return;

75 }	75 }

76	76

	77 #ifdef SK_USE_ACCURATE_BLENDING

77 if (count >= 4) {	78 if (count >= 4) {

78 SkASSERT(((size_t)dst & 0x03) == 0);	79 SkASSERT(((size_t)dst & 0x03) == 0);

79 while (((size_t)dst & 0x0F) != 0) {	80 while (((size_t)dst & 0x0F) != 0) {

80 *dst = SkPMSrcOver(*src, *dst);	81 *dst = SkPMSrcOver(*src, *dst);

81 src++;	82 src++;

82 dst++;	83 dst++;

83 count--;	84 count--;

84 }	85 }

85	86

86 const __m128i *s = reinterpret_cast<const __m128i*>(src);	87 const __m128i *s = reinterpret_cast<const __m128i*>(src);

87 __m128i *d = reinterpret_cast<__m128i*>(dst);	88 __m128i *d = reinterpret_cast<__m128i*>(dst);

88 #ifdef SK_USE_ACCURATE_BLENDING

89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);	89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)	90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)

91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)	91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)

92 while (count >= 4) {	92 while (count >= 4) {

93 // Load 4 pixels	93 // Load 4 pixels

94 __m128i src_pixel = _mm_loadu_si128(s);	94 __m128i src_pixel = _mm_loadu_si128(s);

95 __m128i dst_pixel = _mm_load_si128(d);	95 __m128i dst_pixel = _mm_load_si128(d);

96	96

97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);	97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);	98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

(...skipping 28 matching lines...) Expand all Loading...
127 // Combine back into RGBA.	127 // Combine back into RGBA.

128 dst_pixel = _mm_or_si128(dst_rb, dst_ag);	128 dst_pixel = _mm_or_si128(dst_rb, dst_ag);

129	129

130 // Add result	130 // Add result

131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);	131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

132 _mm_store_si128(d, result);	132 _mm_store_si128(d, result);

133 s++;	133 s++;

134 d++;	134 d++;

135 count -= 4;	135 count -= 4;

136 }	136 }

137 #else

138 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

139 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)

140 while (count >= 4) {

141 // Load 4 pixels

142 __m128i src_pixel = _mm_loadu_si128(s);

143 __m128i dst_pixel = _mm_load_si128(d);

144

145 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

146 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

147

148 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)

149 __m128i alpha = _mm_srli_epi16(src_pixel, 8);

150

151 // (a0, a0, a1, a1, a2, g2, a3, g3)

152 alpha = _mm_shufflehi_epi16(alpha, 0xF5);

153

154 // (a0, a0, a1, a1, a2, a2, a3, a3)

155 alpha = _mm_shufflelo_epi16(alpha, 0xF5);

156

157 // Subtract alphas from 256, to get 1..256

158 alpha = _mm_sub_epi16(c_256, alpha);

159

160 // Multiply red and blue by the inverse src alpha (256 - alpha).

161 dst_rb = _mm_mullo_epi16(dst_rb, alpha);

162 // Multiply alpha and green by the inverse src alpha (256 - alpha).

163 dst_ag = _mm_mullo_epi16(dst_ag, alpha);

164

165 // Divide by 256.

166 dst_rb = _mm_srli_epi16(dst_rb, 8);

167

168 // Mask out high bits (already in the right place)

169 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

170

171 // Combine back into RGBA.

172 dst_pixel = _mm_or_si128(dst_rb, dst_ag);

173

174 // Add result

175 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

176 _mm_store_si128(d, result);

177 s++;

178 d++;

179 count -= 4;

180 }

181 #endif

182 src = reinterpret_cast<const SkPMColor*>(s);	137 src = reinterpret_cast<const SkPMColor*>(s);

183 dst = reinterpret_cast<SkPMColor*>(d);	138 dst = reinterpret_cast<SkPMColor*>(d);

184 }	139 }

185	140

186 while (count > 0) {	141 while (count > 0) {

187 *dst = SkPMSrcOver(*src, *dst);	142 *dst = SkPMSrcOver(*src, *dst);

188 src++;	143 src++;

189 dst++;	144 dst++;

190 count--;	145 count--;

191 }	146 }

	147 #else

	148 int count16 = count / 16;

	149 __m128i* dst4 = (__m128i*)dst;

	150 const __m128i* src4 = (const __m128i*)src;

	151

	152 for (int i = 0; i < count16 * 4; i += 4) {

	153 // Load 16 source pixels.

	154 __m128i s0 = _mm_loadu_si128(src4+i+0),

	155 s1 = _mm_loadu_si128(src4+i+1),

	156 s2 = _mm_loadu_si128(src4+i+2),

	157 s3 = _mm_loadu_si128(src4+i+3);

	158

	159 const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);

	160 const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));

	161 __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());

	162 if (0xffff == _mm_movemask_epi8(cmp)) {

	163 // All 16 source pixels are fully transparent. There's nothing to do !

	164 continue;

	165 }

	166 const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));

	167 cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);

	168 if (0xffff == _mm_movemask_epi8(cmp)) {

	169 // All 16 source pixels are fully opaque. There's no need to read dst or blend it.

	170 _mm_storeu_si128(dst4+i+0, s0);

	171 _mm_storeu_si128(dst4+i+1, s1);

	172 _mm_storeu_si128(dst4+i+2, s2);

	173 _mm_storeu_si128(dst4+i+3, s3);

	174 continue;

	175 }

	176 // The general slow case: do the blend for all 16 pixels.

	177 _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));

	178 _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));

	179 _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));

	180 _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));

	181 }

	182

	183 // Wrap up the last <= 15 pixels.

	184 SkASSERT(count - (count16*16) <= 15);

	185 for (int i = count16*16; i < count; i++) {

	186 // This check is not really necessary, but it prevents pointless autovectorization.

	187 if (src[i] & 0xFF000000) {

	188 dst[i] = SkPMSrcOver(src[i], dst[i]);

	189 }

	190 }

	191 #endif

192 }	192 }

193	193

194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,	194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

195 const SkPMColor* SK_RESTRICT src,	195 const SkPMColor* SK_RESTRICT src,

196 int count, U8CPU alpha) {	196 int count, U8CPU alpha) {

197 SkASSERT(alpha <= 255);	197 SkASSERT(alpha <= 255);

198 if (count <= 0) {	198 if (count <= 0) {

199 return;	199 return;

200 }	200 }

201	201

(...skipping 944 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst);	1146 uint32_t dst_expanded = SkExpand_rgb_16(*dst);

1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);	1147 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

1148 // now src and dst expanded are in g:11 r:10 x:1 b:10	1148 // now src and dst expanded are in g:11 r:10 x:1 b:10

1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);	1149 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

1150 }	1150 }

1151 dst += 1;	1151 dst += 1;

1152 DITHER_INC_X(x);	1152 DITHER_INC_X(x);

1153 } while (--count != 0);	1153 } while (--count != 0);

1154 }	1154 }

1155 }	1155 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »