| Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
 | 
| ===================================================================
 | 
| --- source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c	(revision 293588)
 | 
| +++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c	(working copy)
 | 
| @@ -312,9 +312,11 @@
 | 
|                                           unsigned int out_pitch,
 | 
|                                           unsigned int output_height,
 | 
|                                           int16_t *filter) {
 | 
| -  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
 | 
| +  __m128i addFilterReg64, filtersReg, minReg;
 | 
|    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
 | 
| -  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
 | 
| +  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
 | 
| +  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
 | 
| +  __m128i srcReg8;
 | 
|    unsigned int i;
 | 
|  
 | 
|    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
 | 
| @@ -333,27 +335,26 @@
 | 
|    // duplicate only the forth 16 bits in the filter
 | 
|    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 | 
|  
 | 
| +  // load the first 7 rows of 8 bytes
 | 
| +  srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
 | 
| +  srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
 | 
| +  srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
 | 
| +  srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
 | 
| +  srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
 | 
| +  srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
 | 
| +  srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
 | 
| +
 | 
|    for (i = 0; i < output_height; i++) {
 | 
| -    // load the first 8 bytes
 | 
| -    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
 | 
| -    // load the next 8 bytes in stride of src_pitch
 | 
| -    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
 | 
| -    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
 | 
| -    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
 | 
| +    // load the 8th row of 8 bytes
 | 
| +    srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
 | 
|  
 | 
|      // merge the result together
 | 
| -    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
 | 
| -    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
 | 
| +    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
 | 
| +    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
 | 
|  
 | 
| -    // load the next 8 bytes in stride of src_pitch
 | 
| -    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
 | 
| -    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
 | 
| -    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
 | 
| -    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
 | 
| -
 | 
|      // merge the result together
 | 
| -    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
 | 
| -    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
 | 
| +    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
 | 
| +    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
 | 
|  
 | 
|      // multiply 2 adjacent elements with the filter and add the result
 | 
|      srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
 | 
| @@ -377,6 +378,15 @@
 | 
|  
 | 
|      src_ptr+=src_pitch;
 | 
|  
 | 
| +    // shift down a row
 | 
| +    srcReg1 = srcReg2;
 | 
| +    srcReg2 = srcReg3;
 | 
| +    srcReg3 = srcReg4;
 | 
| +    srcReg4 = srcReg5;
 | 
| +    srcReg5 = srcReg6;
 | 
| +    srcReg6 = srcReg7;
 | 
| +    srcReg7 = srcReg8;
 | 
| +
 | 
|      // save only 8 bytes convolve result
 | 
|      _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
 | 
|  
 | 
| @@ -390,9 +400,11 @@
 | 
|                                            unsigned int out_pitch,
 | 
|                                            unsigned int output_height,
 | 
|                                            int16_t *filter) {
 | 
| -  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
 | 
| +  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
 | 
|    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
 | 
| -  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
 | 
| +  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
 | 
| +  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
 | 
| +  __m128i srcReg8;
 | 
|    unsigned int i;
 | 
|  
 | 
|    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
 | 
| @@ -411,19 +423,24 @@
 | 
|    // duplicate only the forth 16 bits in the filter
 | 
|    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 | 
|  
 | 
| +  // load the first 7 rows of 16 bytes
 | 
| +  srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
 | 
| +  srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
 | 
| +  srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
 | 
| +  srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
 | 
| +  srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
 | 
| +  srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
 | 
| +  srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
 | 
| +
 | 
|    for (i = 0; i < output_height; i++) {
 | 
| -    // load the first 16 bytes
 | 
| -    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
 | 
| -    // load the next 16 bytes in stride of src_pitch
 | 
| -    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
 | 
| -    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
 | 
| -    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
 | 
| +    // load the 8th row of 16 bytes
 | 
| +    srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
 | 
|  
 | 
|      // merge the result together
 | 
| -    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
 | 
| -    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
 | 
| -    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
 | 
| -    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
 | 
| +    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
 | 
| +    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
 | 
| +    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
 | 
| +    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
 | 
|  
 | 
|      // multiply 2 adjacent elements with the filter and add the result
 | 
|      srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
 | 
| @@ -435,25 +452,17 @@
 | 
|      srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
 | 
|      srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
 | 
|  
 | 
| -    // load the next 16 bytes in stride of two/three src_pitch
 | 
| -    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
 | 
| -    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
 | 
| -
 | 
|      // merge the result together
 | 
| -    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
 | 
| -    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
 | 
| +    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
 | 
| +    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
 | 
|  
 | 
|      // multiply 2 adjacent elements with the filter and add the result
 | 
| -    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
 | 
| +    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
 | 
|      srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
 | 
|  
 | 
| -    // load the next 16 bytes in stride of four/five src_pitch
 | 
| -    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
 | 
| -    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
 | 
| -
 | 
|      // merge the result together
 | 
| -    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
 | 
| -    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
 | 
| +    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
 | 
| +    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
 | 
|  
 | 
|      // multiply 2 adjacent elements with the filter and add the result
 | 
|      srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
 | 
| @@ -461,13 +470,13 @@
 | 
|  
 | 
|      // add and saturate the results together
 | 
|      srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
 | 
| -                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
 | 
| +                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
 | 
|      srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
 | 
|                                   _mm_min_epi16(srcRegFilt6, srcRegFilt8));
 | 
|  
 | 
|      // add and saturate the results together
 | 
|      srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
 | 
| -                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
 | 
| +                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
 | 
|      srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
 | 
|                                   _mm_max_epi16(srcRegFilt6, srcRegFilt8));
 | 
|      srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
 | 
| @@ -484,6 +493,15 @@
 | 
|  
 | 
|      src_ptr+=src_pitch;
 | 
|  
 | 
| +    // shift down a row
 | 
| +    srcReg1 = srcReg2;
 | 
| +    srcReg2 = srcReg3;
 | 
| +    srcReg3 = srcReg4;
 | 
| +    srcReg4 = srcReg5;
 | 
| +    srcReg5 = srcReg6;
 | 
| +    srcReg6 = srcReg7;
 | 
| +    srcReg7 = srcReg8;
 | 
| +
 | 
|      // save 16 bytes convolve result
 | 
|      _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
 | 
|  
 | 
| 
 |