third_party/libwebp/dsp/alpha_processing_sse2.c - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/alpha_processing_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 Google Inc. All Rights Reserved.	1 // Copyright 2014 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // Utilities for processing transparent channel.	10 // Utilities for processing transparent channel.

(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
143 return (alpha_and == 0xff);	143 return (alpha_and == 0xff);

144 }	144 }

145	145

146 //------------------------------------------------------------------------------	146 //------------------------------------------------------------------------------

147 // Non-dither premultiplied modes	147 // Non-dither premultiplied modes

148	148

149 #define MULTIPLIER(a) ((a) * 0x8081)	149 #define MULTIPLIER(a) ((a) * 0x8081)

150 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)	150 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

151	151

152 // We can't use a 'const int' for the SHUFFLE value, because it has to be an	152 // We can't use a 'const int' for the SHUFFLE value, because it has to be an

153 // immediate in the _mm_shufflexx_epi16() instruction. We really a macro here.	153 // immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.

154 #define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \	154 // We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit

155 const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \	155 // value.

156 const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \	156 #define APPLY_ALPHA(RGBX, SHUFFLE) do { \

157 const __m128i alpha0 = _mm_and_si128(argb1, MASK); \	157 const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \

158 const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \	158 const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \

159 const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \	159 const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \

160 /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \	160 const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \

161 const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \	161 const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \

162 const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \	162 const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \

163 const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \	163 const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \

164 const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \	164 const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \

165 const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \	165 const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \

166 const __m128i argb5 = _mm_srli_epi16(argb4, 7); \	166 /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \

167 const __m128i argb6 = _mm_or_si128(argb5, alpha0); \	167 const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \

168 const __m128i argb7 = _mm_packus_epi16(argb6, zero); \	168 const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \

169 _mm_storel_epi64((__m128i*)&(RGBX), argb7); \	169 const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \

	170 const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \

	171 const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \

	172 const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \

	173 const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \

	174 _mm_storeu_si128((__m128i*)&(RGBX), A3); \

170 } while (0)	175 } while (0)

171	176

172 static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,	177 static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,

173 int w, int h, int stride) {	178 int w, int h, int stride) {

174 const __m128i zero = _mm_setzero_si128();	179 const __m128i zero = _mm_setzero_si128();

175 const int kSpan = 2;	180 const __m128i kMult = _mm_set1_epi16(0x8081u);

176 const int w2 = w & ~(kSpan - 1);	181 const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);

	182 const int kSpan = 4;

177 while (h-- > 0) {	183 while (h-- > 0) {

178 uint32_t* const rgbx = (uint32_t*)rgba;	184 uint32_t* const rgbx = (uint32_t*)rgba;

179 int i;	185 int i;

180 if (!alpha_first) {	186 if (!alpha_first) {

181 const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);	187 for (i = 0; i + kSpan <= w; i += kSpan) {

182 const __m128i kMult =	188 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));

183 _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);

184 for (i = 0; i < w2; i += kSpan) {

185 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);

186 }	189 }

187 } else {	190 } else {

188 const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);	191 for (i = 0; i + kSpan <= w; i += kSpan) {

189 const __m128i kMult =	192 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));

190 _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);

191 for (i = 0; i < w2; i += kSpan) {

192 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);

193 }	193 }

194 }	194 }

195 // Finish with left-overs.	195 // Finish with left-overs.

196 for (; i < w; ++i) {	196 for (; i < w; ++i) {

197 uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);	197 uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);

198 const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);	198 const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);

199 const uint32_t a = alpha[4 * i];	199 const uint32_t a = alpha[4 * i];

200 if (a != 0xff) {	200 if (a != 0xff) {

201 const uint32_t mult = MULTIPLIER(a);	201 const uint32_t mult = MULTIPLIER(a);

202 rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);	202 rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);

203 rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);	203 rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);

204 rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);	204 rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);

205 }	205 }

206 }	206 }

207 rgba += stride;	207 rgba += stride;

208 }	208 }

209 }	209 }

210 #undef MULTIPLIER	210 #undef MULTIPLIER

211 #undef PREMULTIPLY	211 #undef PREMULTIPLY

212	212

213 // -----------------------------------------------------------------------------	213 // -----------------------------------------------------------------------------

214 // Apply alpha value to rows	214 // Apply alpha value to rows

215	215

216 // We use: kINV255 = (1 << 24) / 255 = 0x010101	216 static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {

217 // So: a * kINV255 = (a << 16) | [(a << 8) | a]

218 // -> _mm_mulhi_epu16() takes care of the (a<<16) part,

219 // and _mm_mullo_epu16(a * 0x0101,...) takes care of the "(a << 8) | a" one.

220

221 static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {

222 int x = 0;	217 int x = 0;

223 if (!inverse) {	218 if (!inverse) {

224 const int kSpan = 2;	219 const int kSpan = 2;

225 const __m128i zero = _mm_setzero_si128();	220 const __m128i zero = _mm_setzero_si128();

226 const __m128i kRound =	221 const __m128i k128 = _mm_set1_epi16(128);

227 _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);	222 const __m128i kMult = _mm_set1_epi16(0x0101);

228 const __m128i kMult =	223 const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);

229 _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);	224 for (x = 0; x + kSpan <= width; x += kSpan) {

230 const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);	225 // To compute 'result = (int)(a * x / 255. + .5)', we use:

231 const int w2 = width & ~(kSpan - 1);	226 // tmp = a * v + 128, result = (tmp * 0x0101u) >> 16

232 for (x = 0; x < w2; x += kSpan) {	227 const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);

233 const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);	228 const __m128i A1 = _mm_unpacklo_epi8(A0, zero);

234 const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);	229 const __m128i A2 = _mm_or_si128(A1, kMask);

235 const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));	230 const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));

236 const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));	231 const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));

237 const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);	232 // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]

238 const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);	233 const __m128i A5 = _mm_mullo_epi16(A4, A1);

239 const __m128i scale1 = _mm_or_si128(tmp2, kOne64);	234 const __m128i A6 = _mm_add_epi16(A5, k128);

240 const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);	235 const __m128i A7 = _mm_mulhi_epu16(A6, kMult);

241 const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);	236 const __m128i A10 = _mm_packus_epi16(A7, zero);

242 const __m128i argb4 = _mm_adds_epu16(argb2, argb3);	237 _mm_storel_epi64((__m128i*)&ptr[x], A10);

243 const __m128i argb5 = _mm_adds_epu16(argb4, kRound);

244 const __m128i argb6 = _mm_srli_epi16(argb5, 8);

245 const __m128i argb7 = _mm_packus_epi16(argb6, zero);

246 _mm_storel_epi64((__m128i*)&ptr[x], argb7);

247 }	238 }

248 }	239 }

249 width -= x;	240 width -= x;

250 if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);	241 if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);

251 }	242 }

252	243

253 static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,	244 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,

254 int width, int inverse) {	245 int width, int inverse) {

255 int x = 0;	246 int x = 0;

256 if (!inverse) {	247 if (!inverse) {

257 const int kSpan = 8;

258 const __m128i zero = _mm_setzero_si128();	248 const __m128i zero = _mm_setzero_si128();

259 const __m128i kRound = _mm_set1_epi16(1 << 7);	249 const __m128i k128 = _mm_set1_epi16(128);

260 const int w2 = width & ~(kSpan - 1);	250 const __m128i kMult = _mm_set1_epi16(0x0101);

261 for (x = 0; x < w2; x += kSpan) {	251 for (x = 0; x + 8 <= width; x += 8) {

262 const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);	252 const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);

	253 const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);

263 const __m128i v1 = _mm_unpacklo_epi8(v0, zero);	254 const __m128i v1 = _mm_unpacklo_epi8(v0, zero);

264 const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);	255 const __m128i a1 = _mm_unpacklo_epi8(a0, zero);

265 const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);	256 const __m128i v2 = _mm_mullo_epi16(v1, a1);

266 const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);	257 const __m128i v3 = _mm_add_epi16(v2, k128);

267 const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);	258 const __m128i v4 = _mm_mulhi_epu16(v3, kMult);

268 const __m128i v3 = _mm_mullo_epi16(v1, alpha1);	259 const __m128i v5 = _mm_packus_epi16(v4, zero);

269 const __m128i v4 = _mm_adds_epu16(v2, v3);	260 _mm_storel_epi64((__m128i*)&ptr[x], v5);

270 const __m128i v5 = _mm_adds_epu16(v4, kRound);

271 const __m128i v6 = _mm_srli_epi16(v5, 8);

272 const __m128i v7 = _mm_packus_epi16(v6, zero);

273 _mm_storel_epi64((__m128i*)&ptr[x], v7);

274 }	261 }

275 }	262 }

276 width -= x;	263 width -= x;

277 if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);	264 if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);

278 }	265 }

279	266

280 //------------------------------------------------------------------------------	267 //------------------------------------------------------------------------------

281 // Entry point	268 // Entry point

282	269

283 extern void WebPInitAlphaProcessingSSE2(void);	270 extern void WebPInitAlphaProcessingSSE2(void);

284	271

285 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {	272 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {

286 WebPMultARGBRow = MultARGBRow;	273 WebPMultARGBRow = MultARGBRow_SSE2;

287 WebPMultRow = MultRow;	274 WebPMultRow = MultRow_SSE2;

288 WebPApplyAlphaMultiply = ApplyAlphaMultiply;	275 WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;

289 WebPDispatchAlpha = DispatchAlpha;	276 WebPDispatchAlpha = DispatchAlpha;

290 WebPDispatchAlphaToGreen = DispatchAlphaToGreen;	277 WebPDispatchAlphaToGreen = DispatchAlphaToGreen;

291 WebPExtractAlpha = ExtractAlpha;	278 WebPExtractAlpha = ExtractAlpha;

292 }	279 }

293	280

294 #else // !WEBP_USE_SSE2	281 #else // !WEBP_USE_SSE2

295	282

296 WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)	283 WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)

297	284

298 #endif // WEBP_USE_SSE2	285 #endif // WEBP_USE_SSE2

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/alpha_processing_neon.c ('k') | third_party/libwebp/dsp/common_sse2.h » ('j') | no next file with comments »