Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c (revision 278778) |
+++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c (working copy) |
@@ -111,21 +111,21 @@ |
// filter the source buffer |
srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); |
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); |
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); |
// multiply 2 adjacent elements with the filter and add the result |
srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); |
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); |
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
// add and saturate the results together |
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); |
// filter the source buffer |
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); |
+ srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); |
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); |
// multiply 2 adjacent elements with the filter and add the result |
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); |
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
// add and saturate the results together |
@@ -146,21 +146,21 @@ |
// filter the source buffer |
srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); |
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); |
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); |
// multiply 2 adjacent elements with the filter and add the result |
srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); |
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); |
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
// add and saturate the results together |
srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); |
// filter the source buffer |
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); |
+ srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); |
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); |
// multiply 2 adjacent elements with the filter and add the result |
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); |
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
// add and saturate the results together |
@@ -208,26 +208,26 @@ |
srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, |
_mm256_castsi256_si128(filt1Reg)); |
srcRegFilt2 = _mm_shuffle_epi8(srcReg1, |
- _mm256_castsi256_si128(filt2Reg)); |
+ _mm256_castsi256_si128(filt4Reg)); |
// multiply 2 adjacent elements with the filter and add the result |
srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, |
_mm256_castsi256_si128(firstFilters)); |
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
- _mm256_castsi256_si128(secondFilters)); |
+ _mm256_castsi256_si128(forthFilters)); |
// add and saturate the results together |
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
// filter the source buffer |
srcRegFilt3= _mm_shuffle_epi8(srcReg1, |
- _mm256_castsi256_si128(filt4Reg)); |
+ _mm256_castsi256_si128(filt2Reg)); |
srcRegFilt2= _mm_shuffle_epi8(srcReg1, |
_mm256_castsi256_si128(filt3Reg)); |
// multiply 2 adjacent elements with the filter and add the result |
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, |
- _mm256_castsi256_si128(forthFilters)); |
+ _mm256_castsi256_si128(secondFilters)); |
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
_mm256_castsi256_si128(thirdFilters)); |
@@ -247,26 +247,26 @@ |
srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, |
_mm256_castsi256_si128(filt1Reg)); |
srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
- _mm256_castsi256_si128(filt2Reg)); |
+ _mm256_castsi256_si128(filt4Reg)); |
// multiply 2 adjacent elements with the filter and add the result |
srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, |
_mm256_castsi256_si128(firstFilters)); |
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
- _mm256_castsi256_si128(secondFilters)); |
+ _mm256_castsi256_si128(forthFilters)); |
// add and saturate the results together |
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
// filter the source buffer |
srcRegFilt3 = _mm_shuffle_epi8(srcReg2, |
- _mm256_castsi256_si128(filt4Reg)); |
+ _mm256_castsi256_si128(filt2Reg)); |
srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
_mm256_castsi256_si128(filt3Reg)); |
// multiply 2 adjacent elements with the filter and add the result |
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, |
- _mm256_castsi256_si128(forthFilters)); |
+ _mm256_castsi256_si128(secondFilters)); |
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
_mm256_castsi256_si128(thirdFilters)); |