OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 #include <immintrin.h> // AVX2 | 10 #include <immintrin.h> // AVX2 |
11 #include "vpx/vpx_integer.h" | 11 #include "vpx/vpx_integer.h" |
12 | 12 |
13 void vp9_sad32x32x4d_avx2(uint8_t *src, | 13 void vpx_sad32x32x4d_avx2(uint8_t *src, |
14 int src_stride, | 14 int src_stride, |
15 uint8_t *ref[4], | 15 uint8_t *ref[4], |
16 int ref_stride, | 16 int ref_stride, |
17 unsigned int res[4]) { | 17 uint32_t res[4]) { |
18 __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; | 18 __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; |
19 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; | 19 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; |
20 __m256i sum_mlow, sum_mhigh; | 20 __m256i sum_mlow, sum_mhigh; |
21 int i; | 21 int i; |
22 uint8_t *ref0, *ref1, *ref2, *ref3; | 22 uint8_t *ref0, *ref1, *ref2, *ref3; |
23 | 23 |
24 ref0 = ref[0]; | 24 ref0 = ref[0]; |
25 ref1 = ref[1]; | 25 ref1 = ref[1]; |
26 ref2 = ref[2]; | 26 ref2 = ref[2]; |
27 ref3 = ref[3]; | 27 ref3 = ref[3]; |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
73 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); | 73 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); |
74 | 74 |
75 // add the low 128 bit to the high 128 bit | 75 // add the low 128 bit to the high 128 bit |
76 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), | 76 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), |
77 _mm256_extractf128_si256(sum_mlow, 1)); | 77 _mm256_extractf128_si256(sum_mlow, 1)); |
78 | 78 |
79 _mm_storeu_si128((__m128i *)(res), sum); | 79 _mm_storeu_si128((__m128i *)(res), sum); |
80 } | 80 } |
81 } | 81 } |
82 | 82 |
83 void vp9_sad64x64x4d_avx2(uint8_t *src, | 83 void vpx_sad64x64x4d_avx2(uint8_t *src, |
84 int src_stride, | 84 int src_stride, |
85 uint8_t *ref[4], | 85 uint8_t *ref[4], |
86 int ref_stride, | 86 int ref_stride, |
87 unsigned int res[4]) { | 87 uint32_t res[4]) { |
88 __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; | 88 __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; |
89 __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; | 89 __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; |
90 __m256i ref3_reg, ref3next_reg; | 90 __m256i ref3_reg, ref3next_reg; |
91 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; | 91 __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; |
92 __m256i sum_mlow, sum_mhigh; | 92 __m256i sum_mlow, sum_mhigh; |
93 int i; | 93 int i; |
94 uint8_t *ref0, *ref1, *ref2, *ref3; | 94 uint8_t *ref0, *ref1, *ref2, *ref3; |
95 | 95 |
96 ref0 = ref[0]; | 96 ref0 = ref[0]; |
97 ref1 = ref[1]; | 97 ref1 = ref[1]; |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
158 // add the low 64 bit to the high 64 bit | 158 // add the low 64 bit to the high 64 bit |
159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); | 159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); |
160 | 160 |
161 // add the low 128 bit to the high 128 bit | 161 // add the low 128 bit to the high 128 bit |
162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), | 162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), |
163 _mm256_extractf128_si256(sum_mlow, 1)); | 163 _mm256_extractf128_si256(sum_mlow, 1)); |
164 | 164 |
165 _mm_storeu_si128((__m128i *)(res), sum); | 165 _mm_storeu_si128((__m128i *)(res), sum); |
166 } | 166 } |
167 } | 167 } |
OLD | NEW |