Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(90)

Unified Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
===================================================================
--- source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c (revision 278778)
+++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c (working copy)
@@ -44,7 +44,7 @@
unsigned int output_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, srcReg, minReg;
unsigned int i;
@@ -61,20 +61,22 @@
// duplicate only the third 16 bit in the filter into the first lane
secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
// duplicate only the seconds 16 bits in the filter into the second lane
+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
// duplicate only the forth 16 bits in the filter into the second lane
+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
// loading the local filters
- thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
- forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+ shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
+ shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
for (i = 0; i < output_height; i++) {
srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
// filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -164,12 +166,12 @@
srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
// add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+ srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
// shift by 7 bit each 16 bits
@@ -229,21 +231,21 @@
// filter the source buffer
srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
// filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+ srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
// add and saturate the results together
@@ -260,21 +262,21 @@
// filter the source buffer
srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
// add and saturate the results together
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
// filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+ srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
// add and saturate the results together
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698