| Index: source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
 | 
| diff --git a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
 | 
| index 0a105629f067478a5754101879b38364e07af345..f49949940395e9ef29ec8c76470e4122c843879f 100644
 | 
| --- a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
 | 
| +++ b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
 | 
| @@ -90,6 +90,17 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
 | 
|    s0 = _mm_adds_epu16(s0, t0);
 | 
|    s1 = _mm_adds_epu16(s1, t1);
 | 
|  
 | 
| +  if (height == 64) {
 | 
| +    s0 = _mm_srai_epi16(s0, 5);
 | 
| +    s1 = _mm_srai_epi16(s1, 5);
 | 
| +  } else if (height == 32) {
 | 
| +    s0 = _mm_srai_epi16(s0, 4);
 | 
| +    s1 = _mm_srai_epi16(s1, 4);
 | 
| +  } else {
 | 
| +    s0 = _mm_srai_epi16(s0, 3);
 | 
| +    s1 = _mm_srai_epi16(s1, 3);
 | 
| +  }
 | 
| +
 | 
|    _mm_store_si128((__m128i *)hbuf, s0);
 | 
|    hbuf += 8;
 | 
|    _mm_store_si128((__m128i *)hbuf, s1);
 | 
| @@ -100,73 +111,62 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
 | 
|    __m128i src_line = _mm_load_si128((const __m128i *)ref);
 | 
|    __m128i s0 = _mm_sad_epu8(src_line, zero);
 | 
|    __m128i s1;
 | 
| -  (void) width;  // width = 64
 | 
| -
 | 
| -  ref += 16;
 | 
| -  src_line = _mm_load_si128((const __m128i *)ref);
 | 
| -  s1 = _mm_sad_epu8(src_line, zero);
 | 
| -  s0 = _mm_adds_epu16(s0, s1);
 | 
| +  int i;
 | 
| +  const int norm_factor = 3 + (width >> 5);
 | 
|  
 | 
| -  ref += 16;
 | 
| -  src_line = _mm_load_si128((const __m128i *)ref);
 | 
| -  s1 = _mm_sad_epu8(src_line, zero);
 | 
| -  s0 = _mm_adds_epu16(s0, s1);
 | 
| -
 | 
| -  ref += 16;
 | 
| -  src_line = _mm_load_si128((const __m128i *)ref);
 | 
| -  s1 = _mm_sad_epu8(src_line, zero);
 | 
| -  s0 = _mm_adds_epu16(s0, s1);
 | 
| +  for (i = 16; i < width; i += 16) {
 | 
| +    ref += 16;
 | 
| +    src_line = _mm_load_si128((const __m128i *)ref);
 | 
| +    s1 = _mm_sad_epu8(src_line, zero);
 | 
| +    s0 = _mm_adds_epu16(s0, s1);
 | 
| +  }
 | 
|  
 | 
|    s1 = _mm_srli_si128(s0, 8);
 | 
|    s0 = _mm_adds_epu16(s0, s1);
 | 
|  
 | 
| -  return _mm_extract_epi16(s0, 0);
 | 
| +  return _mm_extract_epi16(s0, 0) >> norm_factor;
 | 
|  }
 | 
|  
 | 
| -int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
 | 
| -                        const int width) {
 | 
| +int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
 | 
| +                        const int bwl) {
 | 
|    int idx;
 | 
| -  __m128i zero = _mm_setzero_si128();
 | 
| -  __m128i sum;
 | 
| +  int width = 4 << bwl;
 | 
| +  int16_t mean;
 | 
|    __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
 | 
|    __m128i v1 = _mm_load_si128((const __m128i *)src);
 | 
|    __m128i diff = _mm_subs_epi16(v0, v1);
 | 
| -  __m128i sign = _mm_srai_epi16(diff, 15);
 | 
| -
 | 
| -  diff = _mm_xor_si128(diff, sign);
 | 
| -  sum = _mm_sub_epi16(diff, sign);
 | 
| -
 | 
| -  (void) width;  // width = 64;
 | 
| +  __m128i sum = diff;
 | 
| +  __m128i sse = _mm_madd_epi16(diff, diff);
 | 
|  
 | 
|    ref += 8;
 | 
|    src += 8;
 | 
|  
 | 
| -  v0 = _mm_unpacklo_epi16(sum, zero);
 | 
| -  v1 = _mm_unpackhi_epi16(sum, zero);
 | 
| -  sum = _mm_add_epi32(v0, v1);
 | 
| -
 | 
| -  for (idx = 1; idx < 8; ++idx) {
 | 
| +  for (idx = 8; idx < width; idx += 8) {
 | 
|      v0 = _mm_loadu_si128((const __m128i *)ref);
 | 
|      v1 = _mm_load_si128((const __m128i *)src);
 | 
|      diff = _mm_subs_epi16(v0, v1);
 | 
| -    sign = _mm_srai_epi16(diff, 15);
 | 
| -    diff = _mm_xor_si128(diff, sign);
 | 
| -    diff = _mm_sub_epi16(diff, sign);
 | 
| -
 | 
| -    v0 = _mm_unpacklo_epi16(diff, zero);
 | 
| -    v1 = _mm_unpackhi_epi16(diff, zero);
 | 
|  
 | 
| -    sum = _mm_add_epi32(sum, v0);
 | 
| -    sum = _mm_add_epi32(sum, v1);
 | 
| +    sum = _mm_add_epi16(sum, diff);
 | 
| +    v0  = _mm_madd_epi16(diff, diff);
 | 
| +    sse = _mm_add_epi32(sse, v0);
 | 
|  
 | 
|      ref += 8;
 | 
|      src += 8;
 | 
|    }
 | 
|  
 | 
| -  v0 = _mm_srli_si128(sum, 8);
 | 
| -  sum = _mm_add_epi32(sum, v0);
 | 
| -  v0 = _mm_srli_epi64(sum, 32);
 | 
| -  sum = _mm_add_epi32(sum, v0);
 | 
| +  v0  = _mm_srli_si128(sum, 8);
 | 
| +  sum = _mm_add_epi16(sum, v0);
 | 
| +  v0  = _mm_srli_epi64(sum, 32);
 | 
| +  sum = _mm_add_epi16(sum, v0);
 | 
| +  v0  = _mm_srli_epi32(sum, 16);
 | 
| +  sum = _mm_add_epi16(sum, v0);
 | 
| +
 | 
| +  v1  = _mm_srli_si128(sse, 8);
 | 
| +  sse = _mm_add_epi32(sse, v1);
 | 
| +  v1  = _mm_srli_epi64(sse, 32);
 | 
| +  sse = _mm_add_epi32(sse, v1);
 | 
| +
 | 
| +  mean = _mm_extract_epi16(sum, 0);
 | 
|  
 | 
| -  return _mm_cvtsi128_si32(sum);
 | 
| +  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
 | 
|  }
 | 
| 
 |