Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(129)

Side by Side Diff: third_party/libwebp/dsp/enc_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 Google Inc. All Rights Reserved. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE2 version of speed-critical encoding functions. 10 // SSE2 version of speed-critical encoding functions.
11 // 11 //
12 // Author: Christian Duvivier (cduvivier@google.com) 12 // Author: Christian Duvivier (cduvivier@google.com)
13 13
14 #include "./dsp.h" 14 #include "./dsp.h"
15 15
16 #if defined(WEBP_USE_SSE2) 16 #if defined(WEBP_USE_SSE2)
17 #include <assert.h>
17 #include <stdlib.h> // for abs() 18 #include <stdlib.h> // for abs()
18 #include <emmintrin.h> 19 #include <emmintrin.h>
19 20
20 #include "./common_sse2.h" 21 #include "./common_sse2.h"
21 #include "../enc/cost.h" 22 #include "../enc/cost_enc.h"
22 #include "../enc/vp8enci.h" 23 #include "../enc/vp8i_enc.h"
23 24
24 //------------------------------------------------------------------------------ 25 //------------------------------------------------------------------------------
25 // Transforms (Paragraph 14.4) 26 // Transforms (Paragraph 14.4)
26 27
27 // Does one or two inverse transforms. 28 // Does one or two inverse transforms.
28 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 29 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
29 int do_two) { 30 int do_two) {
30 // This implementation makes use of 16-bit fixed point versions of two 31 // This implementation makes use of 16-bit fixed point versions of two
31 // multiply constants: 32 // multiply constants:
32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 33 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after
132 const __m128i tmp1 = _mm_add_epi16(b, c); 133 const __m128i tmp1 = _mm_add_epi16(b, c);
133 const __m128i tmp2 = _mm_sub_epi16(b, c); 134 const __m128i tmp2 = _mm_sub_epi16(b, c);
134 const __m128i tmp3 = _mm_sub_epi16(a, d); 135 const __m128i tmp3 = _mm_sub_epi16(a, d);
135 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
136 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
137 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
138 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
139 140
140 // Transpose the two 4x4. 141 // Transpose the two 4x4.
141 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
142 &T2, &T3); 143 &T2, &T3);
143 } 144 }
144 145
145 // Add inverse transform to 'ref' and store. 146 // Add inverse transform to 'ref' and store.
146 { 147 {
147 const __m128i zero = _mm_setzero_si128(); 148 const __m128i zero = _mm_setzero_si128();
148 // Load the reference(s). 149 // Load the reference(s).
149 __m128i ref0, ref1, ref2, ref3; 150 __m128i ref0, ref1, ref2, ref3;
150 if (do_two) { 151 if (do_two) {
151 // Load eight bytes/pixels per line. 152 // Load eight bytes/pixels per line.
152 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 153 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
243 const __m128i zero = _mm_setzero_si128(); 244 const __m128i zero = _mm_setzero_si128();
244 const __m128i seven = _mm_set1_epi16(7); 245 const __m128i seven = _mm_set1_epi16(7);
245 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 246 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
246 5352, 2217, 5352, 2217); 247 5352, 2217, 5352, 2217);
247 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 248 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
248 2217, -5352, 2217, -5352); 249 2217, -5352, 2217, -5352);
249 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); 250 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
250 const __m128i k51000 = _mm_set1_epi32(51000); 251 const __m128i k51000 = _mm_set1_epi32(51000);
251 252
252 // Same operations are done on the (0,3) and (1,2) pairs. 253 // Same operations are done on the (0,3) and (1,2) pairs.
253 // a0 = v0 + v3
254 // a1 = v1 + v2
255 // a3 = v0 - v3 254 // a3 = v0 - v3
256 // a2 = v1 - v2 255 // a2 = v1 - v2
257 const __m128i a01 = _mm_add_epi16(*v01, *v32);
258 const __m128i a32 = _mm_sub_epi16(*v01, *v32); 256 const __m128i a32 = _mm_sub_epi16(*v01, *v32);
259 const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
260 const __m128i a22 = _mm_unpackhi_epi64(a32, a32); 257 const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
261 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
262 258
263 // d0 = (a0 + a1 + 7) >> 4;
264 // d2 = (a0 - a1 + 7) >> 4;
265 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
266 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
267 const __m128i d0 = _mm_srai_epi16(c0, 4);
268 const __m128i d2 = _mm_srai_epi16(c2, 4);
269
270 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
271 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
272 const __m128i b23 = _mm_unpacklo_epi16(a22, a32); 259 const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
273 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); 260 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
274 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); 261 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
275 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); 262 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
276 const __m128i d3 = _mm_add_epi32(c3, k51000); 263 const __m128i d3 = _mm_add_epi32(c3, k51000);
277 const __m128i e1 = _mm_srai_epi32(d1, 16); 264 const __m128i e1 = _mm_srai_epi32(d1, 16);
278 const __m128i e3 = _mm_srai_epi32(d3, 16); 265 const __m128i e3 = _mm_srai_epi32(d3, 16);
266 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
267 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
279 const __m128i f1 = _mm_packs_epi32(e1, e1); 268 const __m128i f1 = _mm_packs_epi32(e1, e1);
280 const __m128i f3 = _mm_packs_epi32(e3, e3); 269 const __m128i f3 = _mm_packs_epi32(e3, e3);
281 // f1 = f1 + (a3 != 0); 270 // g1 = f1 + (a3 != 0);
282 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the 271 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
283 // desired (0, 1), we add one earlier through k12000_plus_one. 272 // desired (0, 1), we add one earlier through k12000_plus_one.
284 // -> f1 = f1 + 1 - (a3 == 0) 273 // -> g1 = f1 + 1 - (a3 == 0)
285 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); 274 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
286 275
276 // a0 = v0 + v3
277 // a1 = v1 + v2
278 const __m128i a01 = _mm_add_epi16(*v01, *v32);
279 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
280 const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
281 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
282 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
283 // d0 = (a0 + a1 + 7) >> 4;
284 // d2 = (a0 - a1 + 7) >> 4;
285 const __m128i d0 = _mm_srai_epi16(c0, 4);
286 const __m128i d2 = _mm_srai_epi16(c2, 4);
287
287 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); 288 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
288 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); 289 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
289 _mm_storeu_si128((__m128i*)&out[0], d0_g1); 290 _mm_storeu_si128((__m128i*)&out[0], d0_g1);
290 _mm_storeu_si128((__m128i*)&out[8], d2_f3); 291 _mm_storeu_si128((__m128i*)&out[8], d2_f3);
291 } 292 }
292 293
293 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 294 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
294 const __m128i zero = _mm_setzero_si128(); 295 const __m128i zero = _mm_setzero_si128();
295 // Load src. 296 // Load src.
296 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 297 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
(...skipping 742 matching lines...) Expand 10 before | Expand all | Expand 10 after
1039 const __m128i e0 = _mm_madd_epi16(d0, d0); 1040 const __m128i e0 = _mm_madd_epi16(d0, d0);
1040 const __m128i e1 = _mm_madd_epi16(d1, d1); 1041 const __m128i e1 = _mm_madd_epi16(d1, d1);
1041 const __m128i sum = _mm_add_epi32(e0, e1); 1042 const __m128i sum = _mm_add_epi32(e0, e1);
1042 1043
1043 int32_t tmp[4]; 1044 int32_t tmp[4];
1044 _mm_storeu_si128((__m128i*)tmp, sum); 1045 _mm_storeu_si128((__m128i*)tmp, sum);
1045 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 1046 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
1046 } 1047 }
1047 1048
1048 //------------------------------------------------------------------------------ 1049 //------------------------------------------------------------------------------
1050
1051 static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
1052 const __m128i mask = _mm_set1_epi16(0x00ff);
1053 const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
1054 const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
1055 const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
1056 const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
1057 const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte
1058 const __m128i b1 = _mm_srli_epi16(a1, 8);
1059 const __m128i b2 = _mm_srli_epi16(a2, 8);
1060 const __m128i b3 = _mm_srli_epi16(a3, 8);
1061 const __m128i c0 = _mm_and_si128(a0, mask); // lo byte
1062 const __m128i c1 = _mm_and_si128(a1, mask);
1063 const __m128i c2 = _mm_and_si128(a2, mask);
1064 const __m128i c3 = _mm_and_si128(a3, mask);
1065 const __m128i d0 = _mm_add_epi32(b0, c0);
1066 const __m128i d1 = _mm_add_epi32(b1, c1);
1067 const __m128i d2 = _mm_add_epi32(b2, c2);
1068 const __m128i d3 = _mm_add_epi32(b3, c3);
1069 const __m128i e0 = _mm_add_epi32(d0, d1);
1070 const __m128i e1 = _mm_add_epi32(d2, d3);
1071 const __m128i f0 = _mm_add_epi32(e0, e1);
1072 uint16_t tmp[8];
1073 _mm_storeu_si128((__m128i*)tmp, f0);
1074 dc[0] = tmp[0] + tmp[1];
1075 dc[1] = tmp[2] + tmp[3];
1076 dc[2] = tmp[4] + tmp[5];
1077 dc[3] = tmp[6] + tmp[7];
1078 }
1079
1080 //------------------------------------------------------------------------------
1049 // Texture distortion 1081 // Texture distortion
1050 // 1082 //
1051 // We try to match the spectral content (weighted) between source and 1083 // We try to match the spectral content (weighted) between source and
1052 // reconstructed samples. 1084 // reconstructed samples.
1053 1085
1054 // Hadamard transform 1086 // Hadamard transform
1055 // Returns the weighted sum of the absolute value of transformed coefficients. 1087 // Returns the weighted sum of the absolute value of transformed coefficients.
1056 // w[] contains a row-major 4 by 4 symmetric matrix. 1088 // w[] contains a row-major 4 by 4 symmetric matrix.
1057 static int TTransform(const uint8_t* inA, const uint8_t* inB, 1089 static int TTransform(const uint8_t* inA, const uint8_t* inB,
1058 const uint16_t* const w) { 1090 const uint16_t* const w) {
(...skipping 265 matching lines...) Expand 10 before | Expand all | Expand 10 after
1324 VP8ITransform = ITransform; 1356 VP8ITransform = ITransform;
1325 VP8FTransform = FTransform; 1357 VP8FTransform = FTransform;
1326 VP8FTransform2 = FTransform2; 1358 VP8FTransform2 = FTransform2;
1327 VP8FTransformWHT = FTransformWHT; 1359 VP8FTransformWHT = FTransformWHT;
1328 VP8SSE16x16 = SSE16x16; 1360 VP8SSE16x16 = SSE16x16;
1329 VP8SSE16x8 = SSE16x8; 1361 VP8SSE16x8 = SSE16x8;
1330 VP8SSE8x8 = SSE8x8; 1362 VP8SSE8x8 = SSE8x8;
1331 VP8SSE4x4 = SSE4x4; 1363 VP8SSE4x4 = SSE4x4;
1332 VP8TDisto4x4 = Disto4x4; 1364 VP8TDisto4x4 = Disto4x4;
1333 VP8TDisto16x16 = Disto16x16; 1365 VP8TDisto16x16 = Disto16x16;
1366 VP8Mean16x4 = Mean16x4;
1367 }
1368
1369 //------------------------------------------------------------------------------
1370 // SSIM / PSNR entry point (TODO(skal): move to its own file later)
1371
// Returns the sum of squared differences between src1[0..len-1] and
// src2[0..len-1]. The SIMD path processes 16 bytes per step; the loop is
// unrolled 2x with the next pair of loads issued before the current pair is
// reduced, to hide load latency. Leftover bytes are handled scalarly.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // 'limit' leaves room for the two 16-byte loads inside the loop body
    // plus the trailing a0/b0 block accumulated after the loop.
    const int limit = len - 32;
    int32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // Prime the pipeline with the first 16-byte block.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      // Load the next block before reducing the previous one.
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndAccumulate(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndAccumulate(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the last pre-loaded block.
    SubtractAndAccumulate(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // Horizontal reduction of the four 32-bit partial sums.
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
  }

  // Scalar tail for the remaining (< 16, or < 48 in the worst unrolled
  // case) bytes.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
1409
// Returns the sum of the eight 16-bit lanes of *m.
static uint32_t HorizontalAdd16b(const __m128i* const m) {
  uint16_t lanes[8];
  // Fold the high 64 bits onto the low 64 bits, then sum the four
  // surviving lanes scalarly.
  const __m128i high_half = _mm_srli_si128(*m, 8);
  const __m128i folded = _mm_add_epi16(*m, high_half);
  _mm_storeu_si128((__m128i*)lanes, folded);
  return (uint32_t)lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
1417
// Returns the sum of the four 32-bit lanes of *m.
static uint32_t HorizontalAdd32b(const __m128i* const m) {
  // Two fold steps: 4 lanes -> 2 lanes -> 1 lane in the low position.
  const __m128i fold2 = _mm_add_epi32(*m, _mm_srli_si128(*m, 8));
  const __m128i fold1 = _mm_add_epi32(fold2, _mm_srli_si128(fold2, 4));
  return (uint32_t)_mm_cvtsi128_si32(fold1);
}
1424
// Horizontal weights of the 7-tap triangular SSIM kernel (1,2,3,4,3,2,1);
// the trailing 0 pads the table to a full 8-lane SSE register.
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };

// Accumulates one row of weighted SSIM statistics and advances src1/src2 by
// one stride. Expansion-site contract: reads 'Wx' (horizontal weights),
// 'zero', 'stride1', 'stride2'; updates 'xm', 'ym' (16-bit weighted pixel
// sums), 'xxm', 'xym', 'yym' (32-bit weighted product sums), 'src1', 'src2'.
// The 8th kWeight lane is 0, so the extra 8th byte loaded never contributes.
#define ACCUMULATE_ROW(WEIGHT) do { \
  /* compute row weight (Wx * Wy) */ \
  const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
  const __m128i W = _mm_mullo_epi16(Wx, Wy); \
  /* process 8 bytes at a time (7 bytes, actually) */ \
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
  /* convert to 16b and multiply by weight */ \
  const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
  const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
  const __m128i wa1 = _mm_mullo_epi16(a1, W); \
  const __m128i wb1 = _mm_mullo_epi16(b1, W); \
  /* accumulate */ \
  xm = _mm_add_epi16(xm, wa1); \
  ym = _mm_add_epi16(ym, wb1); \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
  src1 += stride1; \
  src2 += stride2; \
} while (0)
1448
// Computes the SSIM score of the 7x7 window anchored at src1/src2 (one
// source, one reconstructed), using the separable triangular kernel in
// kWeight for both directions. Gathers the five weighted moments
// (sum x, sum y, sum x*x, sum x*y, sum y*y) and delegates the final SSIM
// formula to VP8SSIMFromStats.
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                           const uint8_t* src2, int stride2) {
  VP8DistoStats stats;
  const __m128i zero = _mm_setzero_si128();
  __m128i xm = zero, ym = zero;                // 16b accums
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  // kWeight and the ACCUMULATE_ROW calls below hard-code a 7-tap kernel.
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  // Vertical weights 1,2,3,4,3,2,1 mirror the horizontal ones in Wx.
  ACCUMULATE_ROW(1);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(4);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(1);
  // Reduce the SIMD accumulators to the scalar statistics struct.
  stats.xm  = HorizontalAdd16b(&xm);
  stats.ym  = HorizontalAdd16b(&ym);
  stats.xxm = HorizontalAdd32b(&xxm);
  stats.xym = HorizontalAdd32b(&xym);
  stats.yym = HorizontalAdd32b(&yym);
  return VP8SSIMFromStats(&stats);
}
1471
// Entry point: installs the SSE2 implementations of the SSIM/PSNR helpers
// into the dispatch function pointers.
extern void VP8SSIMDspInitSSE2(void);

// NOTE(review): WEBP_TSAN_IGNORE_FUNCTION presumably suppresses thread-
// sanitizer reports on these one-time pointer writes — confirm against the
// macro's definition in dsp.h.
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
  VP8AccumulateSSE = AccumulateSSE_SSE2;
  VP8SSIMGet = SSIMGet_SSE2;
}
1335 1478
1336 #else // !WEBP_USE_SSE2 1479 #else // !WEBP_USE_SSE2
1337 1480
1338 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) 1481 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
1482 WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
1339 1483
1340 #endif // WEBP_USE_SSE2 1484 #endif // WEBP_USE_SSE2
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698