| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 #include <immintrin.h> // AVX2 | 10 #include <immintrin.h> // AVX2 |
| 11 #include "vpx/vpx_integer.h" | 11 #include "vpx/vpx_integer.h" |
| 12 | 12 |
| 13 void vp9_sad32x32x4d_avx2(uint8_t *src, | 13 void vpx_sad32x32x4d_avx2(uint8_t *src, |
| 14 int src_stride, | 14 int src_stride, |
| 15 uint8_t *ref[4], | 15 uint8_t *ref[4], |
| 16 int ref_stride, | 16 int ref_stride, |
| 17 unsigned int res[4]) { | 17 uint32_t res[4]) { |
| 18 __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; | 18 __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; |
| 19 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; | 19 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; |
| 20 __m256i sum_mlow, sum_mhigh; | 20 __m256i sum_mlow, sum_mhigh; |
| 21 int i; | 21 int i; |
| 22 uint8_t *ref0, *ref1, *ref2, *ref3; | 22 uint8_t *ref0, *ref1, *ref2, *ref3; |
| 23 | 23 |
| 24 ref0 = ref[0]; | 24 ref0 = ref[0]; |
| 25 ref1 = ref[1]; | 25 ref1 = ref[1]; |
| 26 ref2 = ref[2]; | 26 ref2 = ref[2]; |
| 27 ref3 = ref[3]; | 27 ref3 = ref[3]; |
| (...skipping 45 matching lines...) | (...skipping 45 matching lines...) |
| 73 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); | 73 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); |
| 74 | 74 |
| 75 // add the low 128 bit to the high 128 bit | 75 // add the low 128 bit to the high 128 bit |
| 76 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), | 76 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), |
| 77 _mm256_extractf128_si256(sum_mlow, 1)); | 77 _mm256_extractf128_si256(sum_mlow, 1)); |
| 78 | 78 |
| 79 _mm_storeu_si128((__m128i *)(res), sum); | 79 _mm_storeu_si128((__m128i *)(res), sum); |
| 80 } | 80 } |
| 81 } | 81 } |
| 82 | 82 |
| 83 void vp9_sad64x64x4d_avx2(uint8_t *src, | 83 void vpx_sad64x64x4d_avx2(uint8_t *src, |
| 84 int src_stride, | 84 int src_stride, |
| 85 uint8_t *ref[4], | 85 uint8_t *ref[4], |
| 86 int ref_stride, | 86 int ref_stride, |
| 87 unsigned int res[4]) { | 87 uint32_t res[4]) { |
| 88 __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; | 88 __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; |
| 89 __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; | 89 __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; |
| 90 __m256i ref3_reg, ref3next_reg; | 90 __m256i ref3_reg, ref3next_reg; |
| 91 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; | 91 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; |
| 92 __m256i sum_mlow, sum_mhigh; | 92 __m256i sum_mlow, sum_mhigh; |
| 93 int i; | 93 int i; |
| 94 uint8_t *ref0, *ref1, *ref2, *ref3; | 94 uint8_t *ref0, *ref1, *ref2, *ref3; |
| 95 | 95 |
| 96 ref0 = ref[0]; | 96 ref0 = ref[0]; |
| 97 ref1 = ref[1]; | 97 ref1 = ref[1]; |
| (...skipping 60 matching lines...) | (...skipping 60 matching lines...) |
| 158 // add the low 64 bit to the high 64 bit | 158 // add the low 64 bit to the high 64 bit |
| 159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); | 159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); |
| 160 | 160 |
| 161 // add the low 128 bit to the high 128 bit | 161 // add the low 128 bit to the high 128 bit |
| 162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), | 162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), |
| 163 _mm256_extractf128_si256(sum_mlow, 1)); | 163 _mm256_extractf128_si256(sum_mlow, 1)); |
| 164 | 164 |
| 165 _mm_storeu_si128((__m128i *)(res), sum); | 165 _mm_storeu_si128((__m128i *)(res), sum); |
| 166 } | 166 } |
| 167 } | 167 } |
| OLD | NEW |