| Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
|
| ===================================================================
|
| --- source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c (revision 278778)
|
| +++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c (working copy)
|
| @@ -111,21 +111,21 @@
|
|
|
| // filter the source buffer
|
| srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
|
| - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
|
| + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
|
| - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
|
| + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
|
|
|
| // add and saturate the results together
|
| srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
|
|
|
| // filter the source buffer
|
| - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
|
| + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
|
| srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
|
| + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
|
| srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
|
|
|
| // add and saturate the results together
|
| @@ -146,21 +146,21 @@
|
|
|
| // filter the source buffer
|
| srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
|
| - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
|
| + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
|
| - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
|
| + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
|
|
|
| // add and saturate the results together
|
| srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
|
|
|
| // filter the source buffer
|
| - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
|
| + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
|
| srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
|
| + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
|
| srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
|
|
|
| // add and saturate the results together
|
| @@ -208,26 +208,26 @@
|
| srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
|
| _mm256_castsi256_si128(filt1Reg));
|
| srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
|
| - _mm256_castsi256_si128(filt2Reg));
|
| + _mm256_castsi256_si128(filt4Reg));
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
|
| _mm256_castsi256_si128(firstFilters));
|
| srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
| - _mm256_castsi256_si128(secondFilters));
|
| + _mm256_castsi256_si128(forthFilters));
|
|
|
| // add and saturate the results together
|
| srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
|
|
|
| // filter the source buffer
|
| srcRegFilt3= _mm_shuffle_epi8(srcReg1,
|
| - _mm256_castsi256_si128(filt4Reg));
|
| + _mm256_castsi256_si128(filt2Reg));
|
| srcRegFilt2= _mm_shuffle_epi8(srcReg1,
|
| _mm256_castsi256_si128(filt3Reg));
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
|
| - _mm256_castsi256_si128(forthFilters));
|
| + _mm256_castsi256_si128(secondFilters));
|
| srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
| _mm256_castsi256_si128(thirdFilters));
|
|
|
| @@ -247,26 +247,26 @@
|
| srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
|
| _mm256_castsi256_si128(filt1Reg));
|
| srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
|
| - _mm256_castsi256_si128(filt2Reg));
|
| + _mm256_castsi256_si128(filt4Reg));
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
|
| _mm256_castsi256_si128(firstFilters));
|
| srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
| - _mm256_castsi256_si128(secondFilters));
|
| + _mm256_castsi256_si128(forthFilters));
|
|
|
| // add and saturate the results together
|
| srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
|
|
|
| // filter the source buffer
|
| srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
|
| - _mm256_castsi256_si128(filt4Reg));
|
| + _mm256_castsi256_si128(filt2Reg));
|
| srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
|
| _mm256_castsi256_si128(filt3Reg));
|
|
|
| // multiply 2 adjacent elements with the filter and add the result
|
| srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
|
| - _mm256_castsi256_si128(forthFilters));
|
| + _mm256_castsi256_si128(secondFilters));
|
| srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
| _mm256_castsi256_si128(thirdFilters));
|
|
|
|
|