| OLD | NEW |
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // SSE2 version of speed-critical encoding functions. | 10 // SSE2 version of speed-critical encoding functions. |
| 11 // | 11 // |
| 12 // Author: Christian Duvivier (cduvivier@google.com) | 12 // Author: Christian Duvivier (cduvivier@google.com) |
| 13 | 13 |
| 14 #include "./dsp.h" | 14 #include "./dsp.h" |
| 15 | 15 |
| 16 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
| 17 #include <assert.h> |
| 17 #include <stdlib.h> // for abs() | 18 #include <stdlib.h> // for abs() |
| 18 #include <emmintrin.h> | 19 #include <emmintrin.h> |
| 19 | 20 |
| 20 #include "./common_sse2.h" | 21 #include "./common_sse2.h" |
| 21 #include "../enc/cost.h" | 22 #include "../enc/cost_enc.h" |
| 22 #include "../enc/vp8enci.h" | 23 #include "../enc/vp8i_enc.h" |
| 23 | 24 |
| 24 //------------------------------------------------------------------------------ | 25 //------------------------------------------------------------------------------ |
| 25 // Transforms (Paragraph 14.4) | 26 // Transforms (Paragraph 14.4) |
| 26 | 27 |
| 27 // Does one or two inverse transforms. | 28 // Does one or two inverse transforms. |
| 28 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, | 29 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, |
| 29 int do_two) { | 30 int do_two) { |
| 30 // This implementation makes use of 16-bit fixed point versions of two | 31 // This implementation makes use of 16-bit fixed point versions of two |
| 31 // multiply constants: | 32 // multiply constants: |
| 32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | 33 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 |
| (...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 132 const __m128i tmp1 = _mm_add_epi16(b, c); | 133 const __m128i tmp1 = _mm_add_epi16(b, c); |
| 133 const __m128i tmp2 = _mm_sub_epi16(b, c); | 134 const __m128i tmp2 = _mm_sub_epi16(b, c); |
| 134 const __m128i tmp3 = _mm_sub_epi16(a, d); | 135 const __m128i tmp3 = _mm_sub_epi16(a, d); |
| 135 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); | 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); |
| 136 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); | 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); |
| 137 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); | 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); |
| 138 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); | 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); |
| 139 | 140 |
| 140 // Transpose the two 4x4. | 141 // Transpose the two 4x4. |
| 141 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, | 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, |
| 142 &T2, &T3); | 143 &T2, &T3); |
| 143 } | 144 } |
| 144 | 145 |
| 145 // Add inverse transform to 'ref' and store. | 146 // Add inverse transform to 'ref' and store. |
| 146 { | 147 { |
| 147 const __m128i zero = _mm_setzero_si128(); | 148 const __m128i zero = _mm_setzero_si128(); |
| 148 // Load the reference(s). | 149 // Load the reference(s). |
| 149 __m128i ref0, ref1, ref2, ref3; | 150 __m128i ref0, ref1, ref2, ref3; |
| 150 if (do_two) { | 151 if (do_two) { |
| 151 // Load eight bytes/pixels per line. | 152 // Load eight bytes/pixels per line. |
| 152 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); | 153 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); |
| (...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 243 const __m128i zero = _mm_setzero_si128(); | 244 const __m128i zero = _mm_setzero_si128(); |
| 244 const __m128i seven = _mm_set1_epi16(7); | 245 const __m128i seven = _mm_set1_epi16(7); |
| 245 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, | 246 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, |
| 246 5352, 2217, 5352, 2217); | 247 5352, 2217, 5352, 2217); |
| 247 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, | 248 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, |
| 248 2217, -5352, 2217, -5352); | 249 2217, -5352, 2217, -5352); |
| 249 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); | 250 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); |
| 250 const __m128i k51000 = _mm_set1_epi32(51000); | 251 const __m128i k51000 = _mm_set1_epi32(51000); |
| 251 | 252 |
| 252 // Same operations are done on the (0,3) and (1,2) pairs. | 253 // Same operations are done on the (0,3) and (1,2) pairs. |
| 253 // a0 = v0 + v3 | |
| 254 // a1 = v1 + v2 | |
| 255 // a3 = v0 - v3 | 254 // a3 = v0 - v3 |
| 256 // a2 = v1 - v2 | 255 // a2 = v1 - v2 |
| 257 const __m128i a01 = _mm_add_epi16(*v01, *v32); | |
| 258 const __m128i a32 = _mm_sub_epi16(*v01, *v32); | 256 const __m128i a32 = _mm_sub_epi16(*v01, *v32); |
| 259 const __m128i a11 = _mm_unpackhi_epi64(a01, a01); | |
| 260 const __m128i a22 = _mm_unpackhi_epi64(a32, a32); | 257 const __m128i a22 = _mm_unpackhi_epi64(a32, a32); |
| 261 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); | |
| 262 | 258 |
| 263 // d0 = (a0 + a1 + 7) >> 4; | |
| 264 // d2 = (a0 - a1 + 7) >> 4; | |
| 265 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); | |
| 266 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); | |
| 267 const __m128i d0 = _mm_srai_epi16(c0, 4); | |
| 268 const __m128i d2 = _mm_srai_epi16(c2, 4); | |
| 269 | |
| 270 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) | |
| 271 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) | |
| 272 const __m128i b23 = _mm_unpacklo_epi16(a22, a32); | 259 const __m128i b23 = _mm_unpacklo_epi16(a22, a32); |
| 273 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); | 260 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); |
| 274 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); | 261 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); |
| 275 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); | 262 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); |
| 276 const __m128i d3 = _mm_add_epi32(c3, k51000); | 263 const __m128i d3 = _mm_add_epi32(c3, k51000); |
| 277 const __m128i e1 = _mm_srai_epi32(d1, 16); | 264 const __m128i e1 = _mm_srai_epi32(d1, 16); |
| 278 const __m128i e3 = _mm_srai_epi32(d3, 16); | 265 const __m128i e3 = _mm_srai_epi32(d3, 16); |
| 266 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) |
| 267 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) |
| 279 const __m128i f1 = _mm_packs_epi32(e1, e1); | 268 const __m128i f1 = _mm_packs_epi32(e1, e1); |
| 280 const __m128i f3 = _mm_packs_epi32(e3, e3); | 269 const __m128i f3 = _mm_packs_epi32(e3, e3); |
| 281 // f1 = f1 + (a3 != 0); | 270 // g1 = f1 + (a3 != 0); |
| 282 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the | 271 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the |
| 283 // desired (0, 1), we add one earlier through k12000_plus_one. | 272 // desired (0, 1), we add one earlier through k12000_plus_one. |
| 284 // -> f1 = f1 + 1 - (a3 == 0) | 273 // -> g1 = f1 + 1 - (a3 == 0) |
| 285 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); | 274 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); |
| 286 | 275 |
| 276 // a0 = v0 + v3 |
| 277 // a1 = v1 + v2 |
| 278 const __m128i a01 = _mm_add_epi16(*v01, *v32); |
| 279 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); |
| 280 const __m128i a11 = _mm_unpackhi_epi64(a01, a01); |
| 281 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); |
| 282 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); |
| 283 // d0 = (a0 + a1 + 7) >> 4; |
| 284 // d2 = (a0 - a1 + 7) >> 4; |
| 285 const __m128i d0 = _mm_srai_epi16(c0, 4); |
| 286 const __m128i d2 = _mm_srai_epi16(c2, 4); |
| 287 |
| 287 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); | 288 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); |
| 288 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); | 289 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); |
| 289 _mm_storeu_si128((__m128i*)&out[0], d0_g1); | 290 _mm_storeu_si128((__m128i*)&out[0], d0_g1); |
| 290 _mm_storeu_si128((__m128i*)&out[8], d2_f3); | 291 _mm_storeu_si128((__m128i*)&out[8], d2_f3); |
| 291 } | 292 } |
| 292 | 293 |
| 293 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { | 294 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { |
| 294 const __m128i zero = _mm_setzero_si128(); | 295 const __m128i zero = _mm_setzero_si128(); |
| 295 // Load src. | 296 // Load src. |
| 296 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); | 297 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); |
| (...skipping 742 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1039 const __m128i e0 = _mm_madd_epi16(d0, d0); | 1040 const __m128i e0 = _mm_madd_epi16(d0, d0); |
| 1040 const __m128i e1 = _mm_madd_epi16(d1, d1); | 1041 const __m128i e1 = _mm_madd_epi16(d1, d1); |
| 1041 const __m128i sum = _mm_add_epi32(e0, e1); | 1042 const __m128i sum = _mm_add_epi32(e0, e1); |
| 1042 | 1043 |
| 1043 int32_t tmp[4]; | 1044 int32_t tmp[4]; |
| 1044 _mm_storeu_si128((__m128i*)tmp, sum); | 1045 _mm_storeu_si128((__m128i*)tmp, sum); |
| 1045 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); | 1046 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); |
| 1046 } | 1047 } |
| 1047 | 1048 |
| 1048 //------------------------------------------------------------------------------ | 1049 //------------------------------------------------------------------------------ |
| 1050 |
| 1051 static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { |
| 1052 const __m128i mask = _mm_set1_epi16(0x00ff); |
| 1053 const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); |
| 1054 const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); |
| 1055 const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]); |
| 1056 const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]); |
| 1057 const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte |
| 1058 const __m128i b1 = _mm_srli_epi16(a1, 8); |
| 1059 const __m128i b2 = _mm_srli_epi16(a2, 8); |
| 1060 const __m128i b3 = _mm_srli_epi16(a3, 8); |
| 1061 const __m128i c0 = _mm_and_si128(a0, mask); // lo byte |
| 1062 const __m128i c1 = _mm_and_si128(a1, mask); |
| 1063 const __m128i c2 = _mm_and_si128(a2, mask); |
| 1064 const __m128i c3 = _mm_and_si128(a3, mask); |
| 1065 const __m128i d0 = _mm_add_epi32(b0, c0); |
| 1066 const __m128i d1 = _mm_add_epi32(b1, c1); |
| 1067 const __m128i d2 = _mm_add_epi32(b2, c2); |
| 1068 const __m128i d3 = _mm_add_epi32(b3, c3); |
| 1069 const __m128i e0 = _mm_add_epi32(d0, d1); |
| 1070 const __m128i e1 = _mm_add_epi32(d2, d3); |
| 1071 const __m128i f0 = _mm_add_epi32(e0, e1); |
| 1072 uint16_t tmp[8]; |
| 1073 _mm_storeu_si128((__m128i*)tmp, f0); |
| 1074 dc[0] = tmp[0] + tmp[1]; |
| 1075 dc[1] = tmp[2] + tmp[3]; |
| 1076 dc[2] = tmp[4] + tmp[5]; |
| 1077 dc[3] = tmp[6] + tmp[7]; |
| 1078 } |
| 1079 |
| 1080 //------------------------------------------------------------------------------ |
| 1049 // Texture distortion | 1081 // Texture distortion |
| 1050 // | 1082 // |
| 1051 // We try to match the spectral content (weighted) between source and | 1083 // We try to match the spectral content (weighted) between source and |
| 1052 // reconstructed samples. | 1084 // reconstructed samples. |
| 1053 | 1085 |
| 1054 // Hadamard transform | 1086 // Hadamard transform |
| 1055 // Returns the weighted sum of the absolute value of transformed coefficients. | 1087 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 1056 // w[] contains a row-major 4 by 4 symmetric matrix. | 1088 // w[] contains a row-major 4 by 4 symmetric matrix. |
| 1057 static int TTransform(const uint8_t* inA, const uint8_t* inB, | 1089 static int TTransform(const uint8_t* inA, const uint8_t* inB, |
| 1058 const uint16_t* const w) { | 1090 const uint16_t* const w) { |
| (...skipping 265 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1324 VP8ITransform = ITransform; | 1356 VP8ITransform = ITransform; |
| 1325 VP8FTransform = FTransform; | 1357 VP8FTransform = FTransform; |
| 1326 VP8FTransform2 = FTransform2; | 1358 VP8FTransform2 = FTransform2; |
| 1327 VP8FTransformWHT = FTransformWHT; | 1359 VP8FTransformWHT = FTransformWHT; |
| 1328 VP8SSE16x16 = SSE16x16; | 1360 VP8SSE16x16 = SSE16x16; |
| 1329 VP8SSE16x8 = SSE16x8; | 1361 VP8SSE16x8 = SSE16x8; |
| 1330 VP8SSE8x8 = SSE8x8; | 1362 VP8SSE8x8 = SSE8x8; |
| 1331 VP8SSE4x4 = SSE4x4; | 1363 VP8SSE4x4 = SSE4x4; |
| 1332 VP8TDisto4x4 = Disto4x4; | 1364 VP8TDisto4x4 = Disto4x4; |
| 1333 VP8TDisto16x16 = Disto16x16; | 1365 VP8TDisto16x16 = Disto16x16; |
| 1366 VP8Mean16x4 = Mean16x4; |
| 1367 } |
| 1368 |
| 1369 //------------------------------------------------------------------------------ |
| 1370 // SSIM / PSNR entry point (TODO(skal): move to its own file later) |
| 1371 |
// Returns the sum of squared differences between the first 'len' bytes of
// 'src1' and 'src2', accumulated modulo 2^32.
// Blocks of 16 bytes go through SubtractAndAccumulate() (defined elsewhere in
// this file; presumably it leaves per-lane 32b sums of squared differences in
// its output argument). The trailing bytes (up to 31) are handled by the
// scalar loop. A 'len' smaller than 16 only uses the scalar loop.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    const int limit = len - 32;
    // The lanes are read back as unsigned values: adding them up as signed
    // 32b integers could overflow (undefined behaviour) for large totals,
    // whereas the unsigned sum simply wraps like the return type does.
    uint32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // The loads run one block ahead of the accumulation: a0/b0 always hold a
    // block that was fetched but not accumulated yet.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    // Each iteration consumes 32 bytes, hence the 'len - 32' limit.
    while (i <= limit) {
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndAccumulate(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndAccumulate(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the block that is still pending in a0/b0.
    SubtractAndAccumulate(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += tmp[3] + tmp[2] + tmp[1] + tmp[0];
  }

  // Scalar tail for the bytes not covered by the 16-byte blocks.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
| 1409 |
// Returns the sum of the eight 16b lanes of '*m' as a 32b value.
// The upper four lanes are first folded onto the lower four with a 16b
// addition, so each of these four pairwise sums wraps modulo 2^16. The four
// folded lanes are then added up in 32b precision.
static uint32_t HorizontalAdd16b(const __m128i* const m) {
  uint16_t lanes[8];
  const __m128i upper_half = _mm_srli_si128(*m, 8);
  const __m128i folded = _mm_add_epi16(*m, upper_half);
  uint32_t total = 0;
  int k;
  _mm_storeu_si128((__m128i*)lanes, folded);
  // Only the lower four lanes carry the folded sums.
  for (k = 3; k >= 0; --k) {
    total += lanes[k];
  }
  return total;
}
| 1417 |
// Returns the sum (modulo 2^32) of the four 32b lanes of '*m'.
static uint32_t HorizontalAdd32b(const __m128i* const m) {
  // lanes {0,1} += lanes {2,3}
  const __m128i upper_half = _mm_srli_si128(*m, 8);
  const __m128i pair_sums = _mm_add_epi32(*m, upper_half);
  // lane 0 += lane 1
  const __m128i second_pair = _mm_srli_si128(pair_sums, 4);
  const __m128i total = _mm_add_epi32(pair_sums, second_pair);
  return (uint32_t)_mm_cvtsi128_si32(total);
}
| 1424 |
// Per-column weights of the 7-pixel wide window. The trailing zero pads the
// table to eight 16b entries, i.e. one full 128b register.
static const uint16_t kWeight[8] = { 1, 2, 3, 4, 3, 2, 1, 0 };
| 1426 |
// Accumulates one row of the weighted window into the running statistics.
// The macro relies on names from the enclosing scope: 'src1' / 'src2' (read,
// then advanced by 'stride1' / 'stride2'), the column weights 'Wx', the
// constant 'zero', and the accumulators 'xm', 'ym' (16b lanes) and 'xxm',
// 'xym', 'yym' (32b lanes). WEIGHT is the weight applied to the whole row.
#define ACCUMULATE_ROW(WEIGHT) do {                                          \
  /* per-pixel weight for this row: Wx * Wy */                               \
  const __m128i row_wy = _mm_set1_epi16((WEIGHT));                           \
  const __m128i row_w = _mm_mullo_epi16(Wx, row_wy);                         \
  /* 8 bytes are loaded from each source; the 8th lane has a zero weight */  \
  const __m128i px1_8b = _mm_loadl_epi64((const __m128i*)src1);              \
  const __m128i px2_8b = _mm_loadl_epi64((const __m128i*)src2);              \
  /* widen the pixels to 16b lanes, then apply the weights */                \
  const __m128i px1 = _mm_unpacklo_epi8(px1_8b, zero);                       \
  const __m128i px2 = _mm_unpacklo_epi8(px2_8b, zero);                       \
  const __m128i wpx1 = _mm_mullo_epi16(px1, row_w);                          \
  const __m128i wpx2 = _mm_mullo_epi16(px2, row_w);                          \
  /* first-order sums: w*x and w*y */                                        \
  xm = _mm_add_epi16(xm, wpx1);                                              \
  ym = _mm_add_epi16(ym, wpx2);                                              \
  /* second-order sums: w*x*x, w*x*y and w*y*y */                            \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(px1, wpx1));                       \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(px1, wpx2));                       \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(px2, wpx2));                       \
  /* move on to the next row */                                              \
  src1 += stride1;                                                           \
  src2 += stride2;                                                           \
} while (0)
| 1448 |
| 1449 static double SSIMGet_SSE2(const uint8_t* src1, int stride1, |
| 1450 const uint8_t* src2, int stride2) { |
| 1451 VP8DistoStats stats; |
| 1452 const __m128i zero = _mm_setzero_si128(); |
| 1453 __m128i xm = zero, ym = zero; // 16b accums |
| 1454 __m128i xxm = zero, yym = zero, xym = zero; // 32b accum |
| 1455 const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); |
| 1456 assert(2 * VP8_SSIM_KERNEL + 1 == 7); |
| 1457 ACCUMULATE_ROW(1); |
| 1458 ACCUMULATE_ROW(2); |
| 1459 ACCUMULATE_ROW(3); |
| 1460 ACCUMULATE_ROW(4); |
| 1461 ACCUMULATE_ROW(3); |
| 1462 ACCUMULATE_ROW(2); |
| 1463 ACCUMULATE_ROW(1); |
| 1464 stats.xm = HorizontalAdd16b(&xm); |
| 1465 stats.ym = HorizontalAdd16b(&ym); |
| 1466 stats.xxm = HorizontalAdd32b(&xxm); |
| 1467 stats.xym = HorizontalAdd32b(&xym); |
| 1468 stats.yym = HorizontalAdd32b(&yym); |
| 1469 return VP8SSIMFromStats(&stats); |
| 1470 } |
| 1471 |
| 1472 extern void VP8SSIMDspInitSSE2(void); |
| 1473 |
| 1474 WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { |
| 1475 VP8AccumulateSSE = AccumulateSSE_SSE2; |
| 1476 VP8SSIMGet = SSIMGet_SSE2; |
| 1334 } | 1477 } |
| 1335 | 1478 |
| 1336 #else // !WEBP_USE_SSE2 | 1479 #else // !WEBP_USE_SSE2 |
| 1337 | 1480 |
| 1338 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) | 1481 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) |
| 1482 WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) |
| 1339 | 1483 |
| 1340 #endif // WEBP_USE_SSE2 | 1484 #endif // WEBP_USE_SSE2 |
| OLD | NEW |