Index: source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
diff --git a/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
index 7e63f389eade6def507477c56b44b2fabe1c251e..4bd8ac4a315541280a804fa93de052d33cb73a1e 100644
--- a/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
@@ -15,24 +15,38 @@
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
-    __m128i ubounded;
-    __m128i lbounded;
-    __m128i retval;
-
-    const __m128i zero = _mm_set1_epi16(0);
-    const __m128i one = _mm_set1_epi16(1);
-    const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
-    const __m128i max = _mm_subs_epi16(
-        _mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-    const __m128i min = _mm_subs_epi16(zero, t80);
-    ubounded = _mm_cmpgt_epi16(value, max);
-    lbounded = _mm_cmplt_epi16(value, min);
-    retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
-    ubounded = _mm_and_si128(ubounded, max);
-    lbounded = _mm_and_si128(lbounded, min);
-    retval = _mm_or_si128(retval, ubounded);
-    retval = _mm_or_si128(retval, lbounded);
-    return retval;
+  __m128i ubounded;
+  __m128i lbounded;
+  __m128i retval;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i t80, max, min;
+
+  if (bd == 8) {
+    t80 = _mm_set1_epi16(0x80);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+  } else if (bd == 10) {
+    t80 = _mm_set1_epi16(0x200);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+  } else {  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+  }
+
+  min = _mm_subs_epi16(zero, t80);
+
+  ubounded = _mm_cmpgt_epi16(value, max);
+  lbounded = _mm_cmplt_epi16(value, min);
+  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+  ubounded = _mm_and_si128(ubounded, max);
+  lbounded = _mm_and_si128(lbounded, min);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_or_si128(retval, lbounded);
+  return retval;
 }
 
 // TODO(debargha, peter): Break up large functions into smaller ones
@@ -45,14 +59,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
                                                     int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i blimit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
-  const __m128i limit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
-  const __m128i thresh = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i blimit, limit, thresh;
   __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
   __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
   __m128i ps1, qs1, ps0, qs0;
@@ -68,6 +75,26 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   __m128i t4, t3, t80, t1;
   __m128i eight, four;
 
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+  }
+
   q4 = _mm_load_si128((__m128i *)(s + 4 * p));
   p4 = _mm_load_si128((__m128i *)(s - 5 * p));
   q3 = _mm_load_si128((__m128i *)(s + 3 * p));
@@ -121,7 +148,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   // highbd_filter4
   t4 = _mm_set1_epi16(4);
   t3 = _mm_set1_epi16(3);
-  t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  if (bd == 8)
+    t80 = _mm_set1_epi16(0x80);
+  else if (bd == 10)
+    t80 = _mm_set1_epi16(0x200);
+  else  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+
   t1 = _mm_set1_epi16(0x1);
 
   ps1 = _mm_subs_epi16(p1, t80);
@@ -136,7 +169,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   filt = _mm_adds_epi16(filt, work_a);
   filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
   filt = _mm_and_si128(filt, mask);
-
   filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
   filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
 
@@ -153,13 +185,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   filt = _mm_adds_epi16(filter1, t1);
   filt = _mm_srai_epi16(filt, 1);
   filt = _mm_andnot_si128(hev, filt);
-
   qs1 = _mm_adds_epi16(
       signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
       t80);
   ps1 = _mm_adds_epi16(
       signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
      t80);
+
   // end highbd_filter4
   // loopfilter done
 
@@ -175,7 +207,14 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   flat = _mm_max_epi16(work, flat);
   work = _mm_max_epi16(abs_p1p0, abs_q1q0);
   flat = _mm_max_epi16(work, flat);
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  if (bd == 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
   flat = _mm_cmpeq_epi16(flat, zero);
   // end flat_mask4
 
@@ -215,7 +254,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
                                     _mm_subs_epu16(q0, q7)));
   flat2 = _mm_max_epi16(work, flat2);
 
-  flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
+  if (bd == 8)
+    flat2 = _mm_subs_epu16(flat2, one);
+  else if (bd == 10)
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
   flat2 = _mm_cmpeq_epi16(flat2, zero);
   flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
   // end highbd_flat_mask5
@@ -486,15 +531,7 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
   DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
-      bd - 8);
-  const __m128i limit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero),
-      bd - 8);
-  const __m128i thresh = _mm_slli_epi16(
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero),
-      bd - 8);
+  __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
   __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
   __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
@@ -512,18 +549,43 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
 
   const __m128i t4 = _mm_set1_epi16(4);
   const __m128i t3 = _mm_set1_epi16(3);
-  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  __m128i t80;
   const __m128i t1 = _mm_set1_epi16(0x1);
-  const __m128i ps1 = _mm_subs_epi16(p1, t80);
-  const __m128i ps0 = _mm_subs_epi16(p0, t80);
-  const __m128i qs0 = _mm_subs_epi16(q0, t80);
-  const __m128i qs1 = _mm_subs_epi16(q1, t80);
+  __m128i ps1, ps0, qs0, qs1;
   __m128i filt;
   __m128i work_a;
   __m128i filter1, filter2;
 
   (void)count;
 
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_set1_epi16(0x200);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_set1_epi16(0x800);
+  }
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+
   // filter_mask and hev_mask
   abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
                           _mm_subs_epu16(p0, p1));
@@ -575,7 +637,14 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   flat = _mm_max_epi16(work, flat);
   flat = _mm_max_epi16(abs_p1p0, flat);
   flat = _mm_max_epi16(abs_q1q0, flat);
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  if (bd == 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
   flat = _mm_cmpeq_epi16(flat, zero);
   flat = _mm_and_si128(flat, mask);  // flat & mask
 
@@ -706,15 +775,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_thresh,
                                       int count, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
-  const __m128i limit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
-  const __m128i thresh = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
   __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
@@ -737,30 +798,63 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i work;
   const __m128i t4 = _mm_set1_epi16(4);
   const __m128i t3 = _mm_set1_epi16(3);
-  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
-  const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
-  const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
-  const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+  __m128i t80;
+  __m128i tff80;
+  __m128i tffe0;
+  __m128i t1f;
   // equivalent to shifting 0x1f left by bitdepth - 8
   // and setting new bits to 1
   const __m128i t1 = _mm_set1_epi16(0x1);
-  const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+  __m128i t7f;
   // equivalent to shifting 0x7f left by bitdepth - 8
   // and setting new bits to 1
-  const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
-                                     t80);
-  const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
-                                     t80);
-  const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
-                                     t80);
-  const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
-                                     t80);
+  __m128i ps1, ps0, qs0, qs1;
   __m128i filt;
   __m128i work_a;
   __m128i filter1, filter2;
 
   (void)count;
 
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+    tff80 = _mm_set1_epi16(0xff80);
+    tffe0 = _mm_set1_epi16(0xffe0);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+  }
+
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
   hev = _mm_subs_epu16(flat, thresh);
@@ -796,6 +890,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
   filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
   // (vp9_filter + 3 * (qs0 - ps0)) & mask
   filt = _mm_and_si128(filt, mask);
 
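
Note on the pattern used throughout the patch: every shift whose count depended on the runtime value bd - 8 is expanded into explicit bd == 8 / bd == 10 / bd == 12 cases (bd being 8, 10, or 12), presumably so that each _mm_slli_epi16 / _mm_srli_epi16 count and each derived constant (t80, tff80, tffe0, t1f, t7f, and the blimit/limit/thresh scaling) is a compile-time constant. As reference only, here is a minimal scalar sketch of the per-lane clamp that signed_char_clamp_bd_sse2() computes; clamp_bd is a hypothetical helper name and is not part of the file:

#include <stdint.h>

/* Hypothetical scalar sketch (not part of the patch): the clamp that
 * signed_char_clamp_bd_sse2() applies to each 16-bit lane.
 * t80 = 0x80 << (bd - 8) is 0x80, 0x200, 0x800 for bd = 8, 10, 12,
 * so the clamp range is [-128, 127], [-512, 511], [-2048, 2047]. */
static int16_t clamp_bd(int32_t value, int bd) {
  const int32_t t80 = 0x80 << (bd - 8);       /* 0x80 / 0x200 / 0x800 */
  const int32_t max = ((1 << bd) - 1) - t80;  /* 127 / 511 / 2047 */
  const int32_t min = 0 - t80;                /* -128 / -512 / -2048 */
  if (value > max) return (int16_t)max;
  if (value < min) return (int16_t)min;
  return (int16_t)value;
}

In the SSE2 version the same bounds are built with _mm_set1_epi16 and saturating subtracts, and out-of-range lanes are selected with compare/and/andnot masks instead of branches.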