Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(129)

Side by Side Diff: third_party/libwebp/dsp/enc_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 Google Inc. All Rights Reserved. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE2 version of speed-critical encoding functions. 10 // SSE2 version of speed-critical encoding functions.
11 // 11 //
12 // Author: Christian Duvivier (cduvivier@google.com) 12 // Author: Christian Duvivier (cduvivier@google.com)
13 13
14 #include "./dsp.h" 14 #include "./dsp.h"
15 15
16 #if defined(WEBP_USE_SSE2) 16 #if defined(WEBP_USE_SSE2)
17 #include <assert.h>
17 #include <stdlib.h> // for abs() 18 #include <stdlib.h> // for abs()
18 #include <emmintrin.h> 19 #include <emmintrin.h>
19 20
20 #include "./common_sse2.h" 21 #include "./common_sse2.h"
21 #include "../enc/cost.h" 22 #include "../enc/cost_enc.h"
22 #include "../enc/vp8enci.h" 23 #include "../enc/vp8i_enc.h"
23 24
24 //------------------------------------------------------------------------------ 25 //------------------------------------------------------------------------------
25 // Transforms (Paragraph 14.4) 26 // Transforms (Paragraph 14.4)
26 27
27 // Does one or two inverse transforms. 28 // Does one or two inverse transforms.
28 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 29 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
29 int do_two) { 30 int do_two) {
30 // This implementation makes use of 16-bit fixed point versions of two 31 // This implementation makes use of 16-bit fixed point versions of two
31 // multiply constants: 32 // multiply constants:
32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 33 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after
132 const __m128i tmp1 = _mm_add_epi16(b, c); 133 const __m128i tmp1 = _mm_add_epi16(b, c);
133 const __m128i tmp2 = _mm_sub_epi16(b, c); 134 const __m128i tmp2 = _mm_sub_epi16(b, c);
134 const __m128i tmp3 = _mm_sub_epi16(a, d); 135 const __m128i tmp3 = _mm_sub_epi16(a, d);
135 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
136 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
137 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
138 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
139 140
140 // Transpose the two 4x4. 141 // Transpose the two 4x4.
141 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
142 &T2, &T3); 143 &T2, &T3);
143 } 144 }
144 145
145 // Add inverse transform to 'ref' and store. 146 // Add inverse transform to 'ref' and store.
146 { 147 {
147 const __m128i zero = _mm_setzero_si128(); 148 const __m128i zero = _mm_setzero_si128();
148 // Load the reference(s). 149 // Load the reference(s).
149 __m128i ref0, ref1, ref2, ref3; 150 __m128i ref0, ref1, ref2, ref3;
150 if (do_two) { 151 if (do_two) {
151 // Load eight bytes/pixels per line. 152 // Load eight bytes/pixels per line.
152 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 153 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
243 const __m128i zero = _mm_setzero_si128(); 244 const __m128i zero = _mm_setzero_si128();
244 const __m128i seven = _mm_set1_epi16(7); 245 const __m128i seven = _mm_set1_epi16(7);
245 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 246 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
246 5352, 2217, 5352, 2217); 247 5352, 2217, 5352, 2217);
247 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 248 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
248 2217, -5352, 2217, -5352); 249 2217, -5352, 2217, -5352);
249 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); 250 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
250 const __m128i k51000 = _mm_set1_epi32(51000); 251 const __m128i k51000 = _mm_set1_epi32(51000);
251 252
252 // Same operations are done on the (0,3) and (1,2) pairs. 253 // Same operations are done on the (0,3) and (1,2) pairs.
253 // a0 = v0 + v3
254 // a1 = v1 + v2
255 // a3 = v0 - v3 254 // a3 = v0 - v3
256 // a2 = v1 - v2 255 // a2 = v1 - v2
257 const __m128i a01 = _mm_add_epi16(*v01, *v32);
258 const __m128i a32 = _mm_sub_epi16(*v01, *v32); 256 const __m128i a32 = _mm_sub_epi16(*v01, *v32);
259 const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
260 const __m128i a22 = _mm_unpackhi_epi64(a32, a32); 257 const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
261 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
262 258
263 // d0 = (a0 + a1 + 7) >> 4;
264 // d2 = (a0 - a1 + 7) >> 4;
265 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
266 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
267 const __m128i d0 = _mm_srai_epi16(c0, 4);
268 const __m128i d2 = _mm_srai_epi16(c2, 4);
269
270 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
271 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
272 const __m128i b23 = _mm_unpacklo_epi16(a22, a32); 259 const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
273 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); 260 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
274 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); 261 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
275 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); 262 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
276 const __m128i d3 = _mm_add_epi32(c3, k51000); 263 const __m128i d3 = _mm_add_epi32(c3, k51000);
277 const __m128i e1 = _mm_srai_epi32(d1, 16); 264 const __m128i e1 = _mm_srai_epi32(d1, 16);
278 const __m128i e3 = _mm_srai_epi32(d3, 16); 265 const __m128i e3 = _mm_srai_epi32(d3, 16);
266 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
267 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
279 const __m128i f1 = _mm_packs_epi32(e1, e1); 268 const __m128i f1 = _mm_packs_epi32(e1, e1);
280 const __m128i f3 = _mm_packs_epi32(e3, e3); 269 const __m128i f3 = _mm_packs_epi32(e3, e3);
281 // f1 = f1 + (a3 != 0); 270 // g1 = f1 + (a3 != 0);
282 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the 271 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
283 // desired (0, 1), we add one earlier through k12000_plus_one. 272 // desired (0, 1), we add one earlier through k12000_plus_one.
284 // -> f1 = f1 + 1 - (a3 == 0) 273 // -> g1 = f1 + 1 - (a3 == 0)
285 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); 274 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
286 275
276 // a0 = v0 + v3
277 // a1 = v1 + v2
278 const __m128i a01 = _mm_add_epi16(*v01, *v32);
279 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
280 const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
281 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
282 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
283 // d0 = (a0 + a1 + 7) >> 4;
284 // d2 = (a0 - a1 + 7) >> 4;
285 const __m128i d0 = _mm_srai_epi16(c0, 4);
286 const __m128i d2 = _mm_srai_epi16(c2, 4);
287
287 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); 288 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
288 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); 289 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
289 _mm_storeu_si128((__m128i*)&out[0], d0_g1); 290 _mm_storeu_si128((__m128i*)&out[0], d0_g1);
290 _mm_storeu_si128((__m128i*)&out[8], d2_f3); 291 _mm_storeu_si128((__m128i*)&out[8], d2_f3);
291 } 292 }
292 293
293 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 294 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
294 const __m128i zero = _mm_setzero_si128(); 295 const __m128i zero = _mm_setzero_si128();
295 // Load src. 296 // Load src.
296 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 297 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
(...skipping 742 matching lines...) Expand 10 before | Expand all | Expand 10 after
1039 const __m128i e0 = _mm_madd_epi16(d0, d0); 1040 const __m128i e0 = _mm_madd_epi16(d0, d0);
1040 const __m128i e1 = _mm_madd_epi16(d1, d1); 1041 const __m128i e1 = _mm_madd_epi16(d1, d1);
1041 const __m128i sum = _mm_add_epi32(e0, e1); 1042 const __m128i sum = _mm_add_epi32(e0, e1);
1042 1043
1043 int32_t tmp[4]; 1044 int32_t tmp[4];
1044 _mm_storeu_si128((__m128i*)tmp, sum); 1045 _mm_storeu_si128((__m128i*)tmp, sum);
1045 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 1046 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
1046 } 1047 }
1047 1048
1048 //------------------------------------------------------------------------------ 1049 //------------------------------------------------------------------------------
1050
1051 static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
1052 const __m128i mask = _mm_set1_epi16(0x00ff);
1053 const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
1054 const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
1055 const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
1056 const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
1057 const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte
1058 const __m128i b1 = _mm_srli_epi16(a1, 8);
1059 const __m128i b2 = _mm_srli_epi16(a2, 8);
1060 const __m128i b3 = _mm_srli_epi16(a3, 8);
1061 const __m128i c0 = _mm_and_si128(a0, mask); // lo byte
1062 const __m128i c1 = _mm_and_si128(a1, mask);
1063 const __m128i c2 = _mm_and_si128(a2, mask);
1064 const __m128i c3 = _mm_and_si128(a3, mask);
1065 const __m128i d0 = _mm_add_epi32(b0, c0);
1066 const __m128i d1 = _mm_add_epi32(b1, c1);
1067 const __m128i d2 = _mm_add_epi32(b2, c2);
1068 const __m128i d3 = _mm_add_epi32(b3, c3);
1069 const __m128i e0 = _mm_add_epi32(d0, d1);
1070 const __m128i e1 = _mm_add_epi32(d2, d3);
1071 const __m128i f0 = _mm_add_epi32(e0, e1);
1072 uint16_t tmp[8];
1073 _mm_storeu_si128((__m128i*)tmp, f0);
1074 dc[0] = tmp[0] + tmp[1];
1075 dc[1] = tmp[2] + tmp[3];
1076 dc[2] = tmp[4] + tmp[5];
1077 dc[3] = tmp[6] + tmp[7];
1078 }
1079
1080 //------------------------------------------------------------------------------
1049 // Texture distortion 1081 // Texture distortion
1050 // 1082 //
1051 // We try to match the spectral content (weighted) between source and 1083 // We try to match the spectral content (weighted) between source and
1052 // reconstructed samples. 1084 // reconstructed samples.
1053 1085
1054 // Hadamard transform 1086 // Hadamard transform
1055 // Returns the weighted sum of the absolute value of transformed coefficients. 1087 // Returns the weighted sum of the absolute value of transformed coefficients.
1056 // w[] contains a row-major 4 by 4 symmetric matrix. 1088 // w[] contains a row-major 4 by 4 symmetric matrix.
1057 static int TTransform(const uint8_t* inA, const uint8_t* inB, 1089 static int TTransform(const uint8_t* inA, const uint8_t* inB,
1058 const uint16_t* const w) { 1090 const uint16_t* const w) {
(...skipping 265 matching lines...) Expand 10 before | Expand all | Expand 10 after
1324 VP8ITransform = ITransform; 1356 VP8ITransform = ITransform;
1325 VP8FTransform = FTransform; 1357 VP8FTransform = FTransform;
1326 VP8FTransform2 = FTransform2; 1358 VP8FTransform2 = FTransform2;
1327 VP8FTransformWHT = FTransformWHT; 1359 VP8FTransformWHT = FTransformWHT;
1328 VP8SSE16x16 = SSE16x16; 1360 VP8SSE16x16 = SSE16x16;
1329 VP8SSE16x8 = SSE16x8; 1361 VP8SSE16x8 = SSE16x8;
1330 VP8SSE8x8 = SSE8x8; 1362 VP8SSE8x8 = SSE8x8;
1331 VP8SSE4x4 = SSE4x4; 1363 VP8SSE4x4 = SSE4x4;
1332 VP8TDisto4x4 = Disto4x4; 1364 VP8TDisto4x4 = Disto4x4;
1333 VP8TDisto16x16 = Disto16x16; 1365 VP8TDisto16x16 = Disto16x16;
1366 VP8Mean16x4 = Mean16x4;
1367 }
1368
1369 //------------------------------------------------------------------------------
1370 // SSIM / PSNR entry point (TODO(skal): move to its own file later)
1371
// Returns the sum of squared differences between src1[0..len-1] and
// src2[0..len-1]. The SIMD path processes 16 bytes per step; the loop is
// unrolled 2x with the next pair of loads issued before the current pair is
// reduced, to hide load latency. Leftover bytes are handled scalarly.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // 'limit' leaves room for the two 16-byte loads inside the loop body
    // plus the trailing a0/b0 block accumulated after the loop.
    const int limit = len - 32;
    int32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // Prime the pipeline with the first 16-byte block.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      // Load the next block before reducing the previous one.
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndAccumulate(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndAccumulate(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the last pre-loaded block.
    SubtractAndAccumulate(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // Horizontal reduction of the four 32-bit partial sums.
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
  }

  // Scalar tail for the remaining (< 16, or < 48 in the worst unrolled
  // case) bytes.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
1409
// Returns the sum of the eight 16-bit lanes of *m.
static uint32_t HorizontalAdd16b(const __m128i* const m) {
  uint16_t lanes[8];
  // Fold the high 64 bits onto the low 64 bits, then sum the four
  // surviving lanes scalarly.
  const __m128i high_half = _mm_srli_si128(*m, 8);
  const __m128i folded = _mm_add_epi16(*m, high_half);
  _mm_storeu_si128((__m128i*)lanes, folded);
  return (uint32_t)lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
1417
// Returns the sum of the four 32-bit lanes of *m.
static uint32_t HorizontalAdd32b(const __m128i* const m) {
  // Two fold steps: 4 lanes -> 2 lanes -> 1 lane in the low position.
  const __m128i fold2 = _mm_add_epi32(*m, _mm_srli_si128(*m, 8));
  const __m128i fold1 = _mm_add_epi32(fold2, _mm_srli_si128(fold2, 4));
  return (uint32_t)_mm_cvtsi128_si32(fold1);
}
1424
// Horizontal weights of the 7-tap triangular SSIM kernel (1,2,3,4,3,2,1);
// the trailing 0 pads the table to a full 8-lane SSE register.
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };

// Accumulates one row of weighted SSIM statistics and advances src1/src2 by
// one stride. Expansion-site contract: reads 'Wx' (horizontal weights),
// 'zero', 'stride1', 'stride2'; updates 'xm', 'ym' (16-bit weighted pixel
// sums), 'xxm', 'xym', 'yym' (32-bit weighted product sums), 'src1', 'src2'.
// The 8th kWeight lane is 0, so the extra 8th byte loaded never contributes.
#define ACCUMULATE_ROW(WEIGHT) do { \
  /* compute row weight (Wx * Wy) */ \
  const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
  const __m128i W = _mm_mullo_epi16(Wx, Wy); \
  /* process 8 bytes at a time (7 bytes, actually) */ \
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
  /* convert to 16b and multiply by weight */ \
  const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
  const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
  const __m128i wa1 = _mm_mullo_epi16(a1, W); \
  const __m128i wb1 = _mm_mullo_epi16(b1, W); \
  /* accumulate */ \
  xm = _mm_add_epi16(xm, wa1); \
  ym = _mm_add_epi16(ym, wb1); \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
  src1 += stride1; \
  src2 += stride2; \
} while (0)
1448
// Computes the SSIM score of the 7x7 window anchored at src1/src2 (one
// source, one reconstructed), using the separable triangular kernel in
// kWeight for both directions. Gathers the five weighted moments
// (sum x, sum y, sum x*x, sum x*y, sum y*y) and delegates the final SSIM
// formula to VP8SSIMFromStats.
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                           const uint8_t* src2, int stride2) {
  VP8DistoStats stats;
  const __m128i zero = _mm_setzero_si128();
  __m128i xm = zero, ym = zero;                // 16b accums
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  // kWeight and the ACCUMULATE_ROW calls below hard-code a 7-tap kernel.
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  // Vertical weights 1,2,3,4,3,2,1 mirror the horizontal ones in Wx.
  ACCUMULATE_ROW(1);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(4);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(1);
  // Reduce the SIMD accumulators to the scalar statistics struct.
  stats.xm  = HorizontalAdd16b(&xm);
  stats.ym  = HorizontalAdd16b(&ym);
  stats.xxm = HorizontalAdd32b(&xxm);
  stats.xym = HorizontalAdd32b(&xym);
  stats.yym = HorizontalAdd32b(&yym);
  return VP8SSIMFromStats(&stats);
}
1471
// Entry point: installs the SSE2 implementations of the SSIM/PSNR helpers
// into the dispatch function pointers.
extern void VP8SSIMDspInitSSE2(void);

// NOTE(review): WEBP_TSAN_IGNORE_FUNCTION presumably suppresses thread-
// sanitizer reports on these one-time pointer writes — confirm against the
// macro's definition in dsp.h.
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
  VP8AccumulateSSE = AccumulateSSE_SSE2;
  VP8SSIMGet = SSIMGet_SSE2;
}
1335 1478
1336 #else // !WEBP_USE_SSE2 1479 #else // !WEBP_USE_SSE2
1337 1480
1338 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) 1481 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
1482 WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
1339 1483
1340 #endif // WEBP_USE_SSE2 1484 #endif // WEBP_USE_SSE2
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698