source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c - Issue 341293003: libvpx: Pull from upstream

Unified Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

===================================================================

--- source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c (revision 278778)

+++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c (working copy)

@@ -44,7 +44,7 @@

unsigned int output_pitch,

unsigned int output_height,

int16_t *filter) {

- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;

+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;

__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;

__m128i addFilterReg64, filtersReg, srcReg, minReg;

unsigned int i;

@@ -61,20 +61,22 @@

// duplicate only the third 16 bits in the filter into the first lane

secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);

// duplicate only the second 16 bits in the filter into the second lane

+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3

firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);

// duplicate only the fourth 16 bits in the filter into the second lane

+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7

secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

// loading the local filters

- thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);

- forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);

+ shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);

+ shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

for (i = 0; i < output_height; i++) {

srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));

// filter the source buffer

- srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);

- srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);

+ srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);

+ srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);

// multiply 2 adjacent elements with the filter and add the result

srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);

@@ -164,12 +166,12 @@

srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

// add and saturate all the results together

- minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);

- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);

+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);

+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

- srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);

+ srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);

srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);

- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);

srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

// shift by 7 bit each 16 bits

@@ -229,21 +231,21 @@

// filter the source buffer

srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);

- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);

+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);

// multiply 2 adjacent elements with the filter and add the result

srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);

- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

// add and saturate the results together

srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

// filter the source buffer

- srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);

+ srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);

srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);

// multiply 2 adjacent elements with the filter and add the result

- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);

srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

// add and saturate the results together

@@ -260,21 +262,21 @@

// filter the source buffer

srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);

- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);

+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);

// multiply 2 adjacent elements with the filter and add the result

srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);

- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

// add and saturate the results together

srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

// filter the source buffer

- srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);

+ srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);

srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);

// multiply 2 adjacent elements with the filter and add the result

- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);

srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

// add and saturate the results together