third_party/libwebp/dsp/yuv_sse2.c - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/yuv_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 Google Inc. All Rights Reserved.	1 // Copyright 2014 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // YUV->RGB conversion functions	10 // YUV->RGB conversion functions

11 //	11 //

12 // Author: Skal (pascal.massimino@gmail.com)	12 // Author: Skal (pascal.massimino@gmail.com)

13	13

14 #include "./yuv.h"	14 #include "./yuv.h"

15	15

16 #if defined(WEBP_USE_SSE2)	16 #if defined(WEBP_USE_SSE2)

17	17

	18 #include "./common_sse2.h"

	19 #include <stdlib.h>

18 #include <emmintrin.h>	20 #include <emmintrin.h>

19	21

20 //-----------------------------------------------------------------------------	22 //-----------------------------------------------------------------------------

21 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.	23 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.

22	24

23 // These constants are 14b fixed-point version of ITU-R BT.601 constants.	25 // These constants are 14b fixed-point version of ITU-R BT.601 constants.

24 // R = (19077 * y + 26149 * v - 14234) >> 6	26 // R = (19077 * y + 26149 * v - 14234) >> 6

25 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6	27 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6

26 // B = (19077 * y + 33050 * u - 17685) >> 6	28 // B = (19077 * y + 33050 * u - 17685) >> 6

27 static void ConvertYUV444ToRGB(const __m128i* const Y0,	29 static void ConvertYUV444ToRGB(const __m128i* const Y0,

(...skipping 120 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
148 const __m128i rg = _mm_or_si128(r1, g1);	150 const __m128i rg = _mm_or_si128(r1, g1);

149 const __m128i gb = _mm_or_si128(g2, b1);	151 const __m128i gb = _mm_or_si128(g2, b1);

150 #if !defined(WEBP_SWAP_16BIT_CSP)	152 #if !defined(WEBP_SWAP_16BIT_CSP)

151 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);	153 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);

152 #else	154 #else

153 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);	155 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);

154 #endif	156 #endif

155 _mm_storeu_si128((__m128i*)dst, rgb565);	157 _mm_storeu_si128((__m128i*)dst, rgb565);

156 }	158 }

157	159

158 // Function used several times in PlanarTo24b.

159 // It samples the in buffer as follows: one every two unsigned char is stored

160 // at the beginning of the buffer, while the other half is stored at the end.

161 static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,

162 __m128i* const out /*out[6]*/) {

163 const __m128i v_mask = _mm_set1_epi16(0x00ff);

164

165 // Take one every two upper 8b values.

166 out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),

167 _mm_and_si128(in[1], v_mask));

168 out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),

169 _mm_and_si128(in[3], v_mask));

170 out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),

171 _mm_and_si128(in[5], v_mask));

172 // Take one every two lower 8b values.

173 out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));

174 out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));

175 out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));

176 }

177

178 // Pack the planar buffers	160 // Pack the planar buffers

179 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....	161 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....

180 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...	162 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...

181 static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {	163 static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,

	164 __m128i* const in2, __m128i* const in3,

	165 __m128i* const in4, __m128i* const in5,

	166 uint8_t* const rgb) {

182 // The input is 6 registers of sixteen 8b but for the sake of explanation,	167 // The input is 6 registers of sixteen 8b but for the sake of explanation,

183 // let's take 6 registers of four 8b values.	168 // let's take 6 registers of four 8b values.

184 // To pack, we will keep taking one every two 8b integer and move it	169 // To pack, we will keep taking one every two 8b integer and move it

185 // around as follows:	170 // around as follows:

186 // Input:	171 // Input:

187 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7	172 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7

188 // Split the 6 registers in two sets of 3 registers: the first set as the even	173 // Split the 6 registers in two sets of 3 registers: the first set as the even

189 // 8b bytes, the second the odd ones:	174 // 8b bytes, the second the odd ones:

190 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7	175 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7

191 // Repeat the same permutations twice more:	176 // Repeat the same permutations twice more:

192 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7	177 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7

193 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7	178 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7

194 __m128i tmp[6];	179 VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);

195 PlanarTo24bHelper(in, tmp);

196 PlanarTo24bHelper(tmp, in);

197 PlanarTo24bHelper(in, tmp);

198 // We need to do it two more times than the example as we have sixteen bytes.

199 PlanarTo24bHelper(tmp, in);

200 PlanarTo24bHelper(in, tmp);

201	180

202 _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]);	181 _mm_storeu_si128((__m128i*)(rgb + 0), *in0);

203 _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]);	182 _mm_storeu_si128((__m128i*)(rgb + 16), *in1);

204 _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]);	183 _mm_storeu_si128((__m128i*)(rgb + 32), *in2);

205 _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]);	184 _mm_storeu_si128((__m128i*)(rgb + 48), *in3);

206 _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]);	185 _mm_storeu_si128((__m128i*)(rgb + 64), *in4);

207 _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]);	186 _mm_storeu_si128((__m128i*)(rgb + 80), *in5);

208 }	187 }

209 #undef MK_UINT32

210	188

211 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,	189 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

212 uint8_t* dst) {	190 uint8_t* dst) {

213 const __m128i kAlpha = _mm_set1_epi16(255);	191 const __m128i kAlpha = _mm_set1_epi16(255);

214 int n;	192 int n;

215 for (n = 0; n < 32; n += 8, dst += 32) {	193 for (n = 0; n < 32; n += 8, dst += 32) {

216 __m128i R, G, B;	194 __m128i R, G, B;

217 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);	195 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);

218 PackAndStore4(&R, &G, &B, &kAlpha, dst);	196 PackAndStore4(&R, &G, &B, &kAlpha, dst);

219 }	197 }

(...skipping 38 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
258 for (n = 0; n < 32; n += 8, dst += 16) {	236 for (n = 0; n < 32; n += 8, dst += 16) {

259 __m128i R, G, B;	237 __m128i R, G, B;

260 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);	238 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);

261 PackAndStore565(&R, &G, &B, dst);	239 PackAndStore565(&R, &G, &B, dst);

262 }	240 }

263 }	241 }

264	242

265 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,	243 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

266 uint8_t* dst) {	244 uint8_t* dst) {

267 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;	245 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

268 __m128i rgb[6];	246 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

269	247

270 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);	248 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

271 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);	249 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);

272 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);	250 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);

273 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);	251 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);

274	252

275 // Cast to 8b and store as RRRRGGGGBBBB.	253 // Cast to 8b and store as RRRRGGGGBBBB.

276 rgb[0] = _mm_packus_epi16(R0, R1);	254 rgb0 = _mm_packus_epi16(R0, R1);

277 rgb[1] = _mm_packus_epi16(R2, R3);	255 rgb1 = _mm_packus_epi16(R2, R3);

278 rgb[2] = _mm_packus_epi16(G0, G1);	256 rgb2 = _mm_packus_epi16(G0, G1);

279 rgb[3] = _mm_packus_epi16(G2, G3);	257 rgb3 = _mm_packus_epi16(G2, G3);

280 rgb[4] = _mm_packus_epi16(B0, B1);	258 rgb4 = _mm_packus_epi16(B0, B1);

281 rgb[5] = _mm_packus_epi16(B2, B3);	259 rgb5 = _mm_packus_epi16(B2, B3);

282	260

283 // Pack as RGBRGBRGBRGB.	261 // Pack as RGBRGBRGBRGB.

284 PlanarTo24b(rgb, dst);	262 PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);

285 }	263 }

286	264

287 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,	265 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

288 uint8_t* dst) {	266 uint8_t* dst) {

289 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;	267 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

290 __m128i bgr[6];	268 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

291	269

292 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);	270 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

293 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);	271 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);

294 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);	272 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);

295 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);	273 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);

296	274

297 // Cast to 8b and store as BBBBGGGGRRRR.	275 // Cast to 8b and store as BBBBGGGGRRRR.

298 bgr[0] = _mm_packus_epi16(B0, B1);	276 bgr0 = _mm_packus_epi16(B0, B1);

299 bgr[1] = _mm_packus_epi16(B2, B3);	277 bgr1 = _mm_packus_epi16(B2, B3);

300 bgr[2] = _mm_packus_epi16(G0, G1);	278 bgr2 = _mm_packus_epi16(G0, G1);

301 bgr[3] = _mm_packus_epi16(G2, G3);	279 bgr3 = _mm_packus_epi16(G2, G3);

302 bgr[4] = _mm_packus_epi16(R0, R1);	280 bgr4 = _mm_packus_epi16(R0, R1);

303 bgr[5] = _mm_packus_epi16(R2, R3);	281 bgr5= _mm_packus_epi16(R2, R3);

304	282

305 // Pack as BGRBGRBGRBGR.	283 // Pack as BGRBGRBGRBGR.

306 PlanarTo24b(bgr, dst);	284 PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);

307 }	285 }

308	286

309 //-----------------------------------------------------------------------------	287 //-----------------------------------------------------------------------------

310 // Arbitrary-length row conversion functions	288 // Arbitrary-length row conversion functions

311	289

312 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,	290 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

313 uint8_t* dst, int len) {	291 uint8_t* dst, int len) {

314 const __m128i kAlpha = _mm_set1_epi16(255);	292 const __m128i kAlpha = _mm_set1_epi16(255);

315 int n;	293 int n;

316 for (n = 0; n + 8 <= len; n += 8, dst += 32) {	294 for (n = 0; n + 8 <= len; n += 8, dst += 32) {

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
370 u += (n & 1);	348 u += (n & 1);

371 v += (n & 1);	349 v += (n & 1);

372 }	350 }

373 }	351 }

374	352

375 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,	353 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

376 uint8_t* dst, int len) {	354 uint8_t* dst, int len) {

377 int n;	355 int n;

378 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {	356 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {

379 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;	357 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

380 __m128i rgb[6];	358 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

381	359

382 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);	360 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

383 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);	361 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);

384 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);	362 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);

385 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);	363 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);

386	364

387 // Cast to 8b and store as RRRRGGGGBBBB.	365 // Cast to 8b and store as RRRRGGGGBBBB.

388 rgb[0] = _mm_packus_epi16(R0, R1);	366 rgb0 = _mm_packus_epi16(R0, R1);

389 rgb[1] = _mm_packus_epi16(R2, R3);	367 rgb1 = _mm_packus_epi16(R2, R3);

390 rgb[2] = _mm_packus_epi16(G0, G1);	368 rgb2 = _mm_packus_epi16(G0, G1);

391 rgb[3] = _mm_packus_epi16(G2, G3);	369 rgb3 = _mm_packus_epi16(G2, G3);

392 rgb[4] = _mm_packus_epi16(B0, B1);	370 rgb4 = _mm_packus_epi16(B0, B1);

393 rgb[5] = _mm_packus_epi16(B2, B3);	371 rgb5 = _mm_packus_epi16(B2, B3);

394	372

395 // Pack as RGBRGBRGBRGB.	373 // Pack as RGBRGBRGBRGB.

396 PlanarTo24b(rgb, dst);	374 PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);

397	375

398 y += 32;	376 y += 32;

399 u += 16;	377 u += 16;

400 v += 16;	378 v += 16;

401 }	379 }

402 for (; n < len; ++n) { // Finish off	380 for (; n < len; ++n) { // Finish off

403 VP8YuvToRgb(y[0], u[0], v[0], dst);	381 VP8YuvToRgb(y[0], u[0], v[0], dst);

404 dst += 3;	382 dst += 3;

405 y += 1;	383 y += 1;

406 u += (n & 1);	384 u += (n & 1);

407 v += (n & 1);	385 v += (n & 1);

408 }	386 }

409 }	387 }

410	388

411 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,	389 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

412 uint8_t* dst, int len) {	390 uint8_t* dst, int len) {

413 int n;	391 int n;

414 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {	392 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {

415 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;	393 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

416 __m128i bgr[6];	394 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

417	395

418 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);	396 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

419 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);	397 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);

420 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);	398 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);

421 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);	399 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);

422	400

423 // Cast to 8b and store as BBBBGGGGRRRR.	401 // Cast to 8b and store as BBBBGGGGRRRR.

424 bgr[0] = _mm_packus_epi16(B0, B1);	402 bgr0 = _mm_packus_epi16(B0, B1);

425 bgr[1] = _mm_packus_epi16(B2, B3);	403 bgr1 = _mm_packus_epi16(B2, B3);

426 bgr[2] = _mm_packus_epi16(G0, G1);	404 bgr2 = _mm_packus_epi16(G0, G1);

427 bgr[3] = _mm_packus_epi16(G2, G3);	405 bgr3 = _mm_packus_epi16(G2, G3);

428 bgr[4] = _mm_packus_epi16(R0, R1);	406 bgr4 = _mm_packus_epi16(R0, R1);

429 bgr[5] = _mm_packus_epi16(R2, R3);	407 bgr5 = _mm_packus_epi16(R2, R3);

430	408

431 // Pack as BGRBGRBGRBGR.	409 // Pack as BGRBGRBGRBGR.

432 PlanarTo24b(bgr, dst);	410 PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);

433	411

434 y += 32;	412 y += 32;

435 u += 16;	413 u += 16;

436 v += 16;	414 v += 16;

437 }	415 }

438 for (; n < len; ++n) { // Finish off	416 for (; n < len; ++n) { // Finish off

439 VP8YuvToBgr(y[0], u[0], v[0], dst);	417 VP8YuvToBgr(y[0], u[0], v[0], dst);

440 dst += 3;	418 dst += 3;

441 y += 1;	419 y += 1;

442 u += (n & 1);	420 u += (n & 1);

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
492	470

493 RGB24PackedToPlanarHelper(tmp, out);	471 RGB24PackedToPlanarHelper(tmp, out);

494 RGB24PackedToPlanarHelper(out, tmp);	472 RGB24PackedToPlanarHelper(out, tmp);

495 RGB24PackedToPlanarHelper(tmp, out);	473 RGB24PackedToPlanarHelper(tmp, out);

496 RGB24PackedToPlanarHelper(out, tmp);	474 RGB24PackedToPlanarHelper(out, tmp);

497 RGB24PackedToPlanarHelper(tmp, out);	475 RGB24PackedToPlanarHelper(tmp, out);

498 }	476 }

499	477

500 // Convert 8 packed ARGB to r[], g[], b[]	478 // Convert 8 packed ARGB to r[], g[], b[]

501 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,	479 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,

502 __m128i* const r,	480 __m128i* const rgb /*in[6]*/) {

503 __m128i* const g,

504 __m128i* const b) {

505 const __m128i zero = _mm_setzero_si128();	481 const __m128i zero = _mm_setzero_si128();

506 const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0	482 __m128i a0 = LOAD_16(argb + 0);

507 const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4	483 __m128i a1 = LOAD_16(argb + 4);

508 // column-wise transpose	484 __m128i a2 = LOAD_16(argb + 8);

509 const __m128i A0 = _mm_unpacklo_epi8(in0, in1);	485 __m128i a3 = LOAD_16(argb + 12);

510 const __m128i A1 = _mm_unpackhi_epi8(in0, in1);	486 VP8L32bToPlanar(&a0, &a1, &a2, &a3);

511 const __m128i B0 = _mm_unpacklo_epi8(A0, A1);	487 rgb[0] = _mm_unpacklo_epi8(a1, zero);

512 const __m128i B1 = _mm_unpackhi_epi8(A0, A1);	488 rgb[1] = _mm_unpackhi_epi8(a1, zero);

513 // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0	489 rgb[2] = _mm_unpacklo_epi8(a2, zero);

514 // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0	490 rgb[3] = _mm_unpackhi_epi8(a2, zero);

515 const __m128i C0 = _mm_unpacklo_epi8(B0, B1);	491 rgb[4] = _mm_unpacklo_epi8(a3, zero);

516 const __m128i C1 = _mm_unpackhi_epi8(B0, B1);	492 rgb[5] = _mm_unpackhi_epi8(a3, zero);

517 // store 16b

518 *r = _mm_unpacklo_epi8(C1, zero);

519 *g = _mm_unpackhi_epi8(C0, zero);

520 *b = _mm_unpacklo_epi8(C0, zero);

521 }	493 }

522	494

523 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX	495 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX

524 // It's a macro and not a function because we need to use immediate values with	496 // It's a macro and not a function because we need to use immediate values with

525 // srai_epi32, e.g.	497 // srai_epi32, e.g.

526 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \	498 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \

527 ROUNDER, DESCALE_FIX, OUT) do { \	499 ROUNDER, DESCALE_FIX, OUT) do { \

528 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \	500 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \

529 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \	501 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \

530 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \	502 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \

(...skipping 111 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
642 }	614 }

643 for (; i < width; ++i, bgr += 3) { // left-over	615 for (; i < width; ++i, bgr += 3) { // left-over

644 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);	616 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);

645 }	617 }

646 }	618 }

647	619

648 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {	620 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {

649 const int max_width = width & ~15;	621 const int max_width = width & ~15;

650 int i;	622 int i;

651 for (i = 0; i < max_width; i += 16) {	623 for (i = 0; i < max_width; i += 16) {

652 __m128i r, g, b, Y0, Y1;	624 __m128i Y0, Y1, rgb[6];

653 RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);	625 RGB32PackedToPlanar(&argb[i], rgb);

654 ConvertRGBToY(&r, &g, &b, &Y0);	626 ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);

655 RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);	627 ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);

656 ConvertRGBToY(&r, &g, &b, &Y1);

657 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);	628 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);

658 }	629 }

659 for (; i < width; ++i) { // left-over	630 for (; i < width; ++i) { // left-over

660 const uint32_t p = argb[i];	631 const uint32_t p = argb[i];

661 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,	632 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,

662 YUV_HALF);	633 YUV_HALF);

663 }	634 }

664 }	635 }

665	636

666 // Horizontal add (doubled) of two 16b values, result is 16b.	637 // Horizontal add (doubled) of two 16b values, result is 16b.

667 // in: A | B | C | D | ... -> out: 2(A+B) | 2(C+D) | ...	638 // in: A | B | C | D | ... -> out: 2(A+B) | 2(C+D) | ...

668 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,	639 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,

669 __m128i* const out) {	640 __m128i* const out) {

670 const __m128i k2 = _mm_set1_epi16(2);	641 const __m128i k2 = _mm_set1_epi16(2);

671 const __m128i C = _mm_madd_epi16(*A, k2);	642 const __m128i C = _mm_madd_epi16(*A, k2);

672 const __m128i D = _mm_madd_epi16(*B, k2);	643 const __m128i D = _mm_madd_epi16(*B, k2);

673 *out = _mm_packs_epi32(C, D);	644 *out = _mm_packs_epi32(C, D);

674 }	645 }

675	646

676 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,	647 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,

677 int src_width, int do_store) {	648 int src_width, int do_store) {

678 const int max_width = src_width & ~31;	649 const int max_width = src_width & ~31;

679 int i;	650 int i;

680 for (i = 0; i < max_width; i += 32, u += 16, v += 16) {	651 for (i = 0; i < max_width; i += 32, u += 16, v += 16) {

681 __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;	652 __m128i rgb[6], U0, V0, U1, V1;

682 RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0);	653 RGB32PackedToPlanar(&argb[i], rgb);

683 RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1);	654 HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);

684 HorizontalAddPack(&r0, &r1, &r0);	655 HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);

685 HorizontalAddPack(&g0, &g1, &g0);	656 HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);

686 HorizontalAddPack(&b0, &b1, &b0);	657 ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);

687 ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);

688	658

689 RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);	659 RGB32PackedToPlanar(&argb[i + 16], rgb);

690 RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);	660 HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);

691 HorizontalAddPack(&r0, &r1, &r0);	661 HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);

692 HorizontalAddPack(&g0, &g1, &g0);	662 HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);

693 HorizontalAddPack(&b0, &b1, &b0);	663 ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);

694 ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);

695	664

696 U0 = _mm_packus_epi16(U0, U1);	665 U0 = _mm_packus_epi16(U0, U1);

697 V0 = _mm_packus_epi16(V0, V1);	666 V0 = _mm_packus_epi16(V0, V1);

698 if (!do_store) {	667 if (!do_store) {

699 const __m128i prev_u = LOAD_16(u);	668 const __m128i prev_u = LOAD_16(u);

700 const __m128i prev_v = LOAD_16(v);	669 const __m128i prev_v = LOAD_16(v);

701 U0 = _mm_avg_epu8(U0, prev_u);	670 U0 = _mm_avg_epu8(U0, prev_u);

702 V0 = _mm_avg_epu8(V0, prev_v);	671 V0 = _mm_avg_epu8(V0, prev_v);

703 }	672 }

704 STORE_16(U0, u);	673 STORE_16(U0, u);

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
760 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {	729 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {

761 WebPConvertARGBToY = ConvertARGBToY;	730 WebPConvertARGBToY = ConvertARGBToY;

762 WebPConvertARGBToUV = ConvertARGBToUV;	731 WebPConvertARGBToUV = ConvertARGBToUV;

763	732

764 WebPConvertRGB24ToY = ConvertRGB24ToY;	733 WebPConvertRGB24ToY = ConvertRGB24ToY;

765 WebPConvertBGR24ToY = ConvertBGR24ToY;	734 WebPConvertBGR24ToY = ConvertBGR24ToY;

766	735

767 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;	736 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;

768 }	737 }

769	738

	739 //------------------------------------------------------------------------------

	740

	741 #define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic

	742 static uint16_t clip_y(int v) {

	743 return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;

	744 }

	745

	746 static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,

	747 uint16_t* dst, int len) {

	748 uint64_t diff = 0;

	749 uint32_t tmp[4];

	750 int i;

	751 const __m128i zero = _mm_setzero_si128();

	752 const __m128i max = _mm_set1_epi16(MAX_Y);

	753 const __m128i one = _mm_set1_epi16(1);

	754 __m128i sum = zero;

	755

	756 for (i = 0; i + 8 <= len; i += 8) {

	757 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));

	758 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));

	759 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));

	760 const __m128i D = _mm_sub_epi16(A, B); // diff_y

	761 const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)

	762 const __m128i F = _mm_add_epi16(C, D); // new_y

	763 const __m128i G = _mm_or_si128(E, one); // -1 or 1

	764 const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);

	765 const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))

	766 _mm_storeu_si128((__m128i*)(dst + i), H);

	767 sum = _mm_add_epi32(sum, I);

	768 }

	769 _mm_storeu_si128((__m128i*)tmp, sum);

	770 diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];

	771 for (; i < len; ++i) {

	772 const int diff_y = ref[i] - src[i];

	773 const int new_y = (int)dst[i] + diff_y;

	774 dst[i] = clip_y(new_y);

	775 diff += (uint64_t)abs(diff_y);

	776 }

	777 return diff;

	778 }

	779

	780 static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,

	781 int16_t* dst, int len) {

	782 int i = 0;

	783 for (i = 0; i + 8 <= len; i += 8) {

	784 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));

	785 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));

	786 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));

	787 const __m128i D = _mm_sub_epi16(A, B); // diff_uv

	788 const __m128i E = _mm_add_epi16(C, D); // new_uv

	789 _mm_storeu_si128((__m128i*)(dst + i), E);

	790 }

	791 for (; i < len; ++i) {

	792 const int diff_uv = ref[i] - src[i];

	793 dst[i] += diff_uv;

	794 }

	795 }

	796

	797 static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,

	798 const uint16_t* best_y, uint16_t* out) {

	799 int i;

	800 const __m128i kCst8 = _mm_set1_epi16(8);

	801 const __m128i max = _mm_set1_epi16(MAX_Y);

	802 const __m128i zero = _mm_setzero_si128();

	803 for (i = 0; i + 8 <= len; i += 8) {

	804 const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));

	805 const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));

	806 const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));

	807 const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));

	808 const __m128i a0b1 = _mm_add_epi16(a0, b1);

	809 const __m128i a1b0 = _mm_add_epi16(a1, b0);

	810 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1

	811 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);

	812 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)

	813 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)

	814 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);

	815 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);

	816 const __m128i d0 = _mm_add_epi16(c1, a0);

	817 const __m128i d1 = _mm_add_epi16(c0, a1);

	818 const __m128i e0 = _mm_srai_epi16(d0, 1);

	819 const __m128i e1 = _mm_srai_epi16(d1, 1);

	820 const __m128i f0 = _mm_unpacklo_epi16(e0, e1);

	821 const __m128i f1 = _mm_unpackhi_epi16(e0, e1);

	822 const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));

	823 const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));

	824 const __m128i h0 = _mm_add_epi16(g0, f0);

	825 const __m128i h1 = _mm_add_epi16(g1, f1);

	826 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);

	827 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);

	828 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);

	829 _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);

	830 }

	831 for (; i < len; ++i) {

	832 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =

	833 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4

	834 // We reuse the common sub-expressions.

	835 const int a0b1 = A[i + 0] + B[i + 1];

	836 const int a1b0 = A[i + 1] + B[i + 0];

	837 const int a0a1b0b1 = a0b1 + a1b0 + 8;

	838 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;

	839 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;

	840 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);

	841 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);

	842 }

	843 }

	844

	845 #undef MAX_Y

	846

	847 //------------------------------------------------------------------------------

	848

	849 extern void WebPInitSharpYUVSSE2(void);

	850

	851 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {

	852 WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;

	853 WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;

	854 WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;

	855 }

	856

770 #else // !WEBP_USE_SSE2	857 #else // !WEBP_USE_SSE2

771	858

772 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)	859 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)

773 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)	860 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)

	861 WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)

774	862

775 #endif // WEBP_USE_SSE2	863 #endif // WEBP_USE_SSE2

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/yuv.c ('k') | third_party/libwebp/enc/alpha.c » ('j') | no next file with comments »