Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c (revision 291087) |
+++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c (working copy) |
@@ -307,7 +307,7 @@ |
__m256i addFilterReg64; |
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; |
__m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; |
- __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; |
+ __m256i srcReg32b11, srcReg32b12, filtersReg32; |
__m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
unsigned int i; |
unsigned int src_stride, dst_stride; |
@@ -409,36 +409,36 @@ |
// multiply 2 adjacent elements with the filter and add the result |
srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); |
srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); |
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); |
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); |
// add and saturate the results together |
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); |
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); |
- |
// multiply 2 adjacent elements with the filter and add the result |
srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); |
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); |
- |
- // multiply 2 adjacent elements with the filter and add the result |
srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); |
- srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); |
- |
// add and saturate the results together |
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, |
_mm256_min_epi16(srcReg32b8, srcReg32b12)); |
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, |
- _mm256_min_epi16(srcReg32b6, srcReg32b13)); |
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, |
+ _mm256_max_epi16(srcReg32b8, srcReg32b12)); |
+ // multiply 2 adjacent elements with the filter and add the result |
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); |
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); |
+ |
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); |
+ |
+ // multiply 2 adjacent elements with the filter and add the result |
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); |
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); |
+ |
// add and saturate the results together |
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, |
- _mm256_max_epi16(srcReg32b8, srcReg32b12)); |
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, |
- _mm256_max_epi16(srcReg32b6, srcReg32b13)); |
+ _mm256_min_epi16(srcReg32b8, srcReg32b12)); |
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, |
+ _mm256_max_epi16(srcReg32b8, srcReg32b12)); |
- |
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); |
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); |