| Index: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
 | 
| ===================================================================
 | 
| --- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c	(revision 240950)
 | 
| +++ source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c	(working copy)
 | 
| @@ -26,24 +26,25 @@
 | 
|    //    by constructing the 32 bit constant corresponding to that pair.
 | 
|    const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
 | 
|    const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
 | 
| -  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
 | 
| -  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
 | 
| +  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
 | 
| +  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
 | 
|    const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 | 
|    const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
 | 
|    const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
 | 
|    const __m128i kOne = _mm_set1_epi16(1);
 | 
| -  __m128i in0, in1, in2, in3;
 | 
| +  __m128i in0, in1;
 | 
|    // Load inputs.
 | 
|    {
 | 
|      in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
 | 
| -    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
 | 
| -    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
 | 
| -    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
 | 
| +    in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
 | 
| +           (input +  1 * stride)));
 | 
| +    in1  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
 | 
| +    in1  = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)
 | 
| +           (input +  3 * stride)), in1);
 | 
| +
 | 
|      // x = x << 4
 | 
|      in0 = _mm_slli_epi16(in0, 4);
 | 
|      in1 = _mm_slli_epi16(in1, 4);
 | 
| -    in2 = _mm_slli_epi16(in2, 4);
 | 
| -    in3 = _mm_slli_epi16(in3, 4);
 | 
|      // if (i == 0 && input[0]) input[0] += 1;
 | 
|      {
 | 
|        // The mask will only contain wether the first value is zero, all
 | 
| @@ -60,18 +61,18 @@
 | 
|    // Do the two transform/transpose passes
 | 
|    for (pass = 0; pass < 2; ++pass) {
 | 
|      // Transform 1/2: Add/substract
 | 
| -    const __m128i r0 = _mm_add_epi16(in0, in3);
 | 
| -    const __m128i r1 = _mm_add_epi16(in1, in2);
 | 
| -    const __m128i r2 = _mm_sub_epi16(in1, in2);
 | 
| -    const __m128i r3 = _mm_sub_epi16(in0, in3);
 | 
| +    const __m128i r0 = _mm_add_epi16(in0, in1);
 | 
| +    const __m128i r1 = _mm_sub_epi16(in0, in1);
 | 
| +    const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
 | 
| +    const __m128i r3 = _mm_unpackhi_epi64(r0, r1);
 | 
|      // Transform 1/2: Interleave to do the multiply by constants which gets us
 | 
|      //                into 32 bits.
 | 
| -    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
 | 
| -    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
 | 
| +    const __m128i t0 = _mm_unpacklo_epi16(r2, r3);
 | 
| +    const __m128i t2 = _mm_unpackhi_epi16(r2, r3);
 | 
|      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
 | 
|      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
 | 
| -    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
 | 
| -    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
 | 
| +    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p08_p24);
 | 
| +    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_p24_m08);
 | 
|      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
 | 
|      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
 | 
|      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
 | 
| @@ -90,25 +91,22 @@
 | 
|      // 00 10 01 11 02 12 03 13
 | 
|      // 20 30 21 31 22 32 23 33
 | 
|      in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
 | 
| -    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
 | 
| +    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
 | 
| +    in1 = _mm_shuffle_epi32(in1, 0x4E);
 | 
|      // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
 | 
| -    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
 | 
| -    if (0 == pass) {
 | 
| -      // Extract values in the high part for second pass as transform code
 | 
| -      // only uses the first four values.
 | 
| -      in1 = _mm_unpackhi_epi64(in0, in0);
 | 
| -      in3 = _mm_unpackhi_epi64(in2, in2);
 | 
| -    } else {
 | 
| -      // Post-condition output and store it (v + 1) >> 2, taking advantage
 | 
| -      // of the fact 1/3 are stored just after 0/2.
 | 
| -      __m128i out01 = _mm_add_epi16(in0, kOne);
 | 
| -      __m128i out23 = _mm_add_epi16(in2, kOne);
 | 
| -      out01 = _mm_srai_epi16(out01, 2);
 | 
| -      out23 = _mm_srai_epi16(out23, 2);
 | 
| -      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
 | 
| -      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
 | 
| -    }
 | 
| +    // 03 13 23 33 02 12 22 32      in1 contains 3 followed by 2 (halves swapped by the shuffle above)
 | 
|    }
 | 
| +  in1 = _mm_shuffle_epi32(in1, 0x4E);
 | 
| +  // Post-condition output and store it (v + 1) >> 2, taking advantage
 | 
| +  // of the fact 1/3 are stored just after 0/2.
 | 
| +  {
 | 
| +     __m128i out01 = _mm_add_epi16(in0, kOne);
 | 
| +     __m128i out23 = _mm_add_epi16(in1, kOne);
 | 
| +     out01 = _mm_srai_epi16(out01, 2);
 | 
| +     out23 = _mm_srai_epi16(out23, 2);
 | 
| +     _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
 | 
| +     _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
 | 
| +  }
 | 
|  }
 | 
|  
 | 
|  static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
 | 
| @@ -206,12 +204,12 @@
 | 
|    const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 | 
|    __m128i u[8], v[8];
 | 
|    __m128i in7 = _mm_add_epi16(in[0], in[1]);
 | 
| -  in7 = _mm_sub_epi16(in7, in[3]);
 | 
|  
 | 
|    u[0] = _mm_unpacklo_epi16(in[0], in[1]);
 | 
|    u[1] = _mm_unpacklo_epi16(in[2], in[3]);
 | 
|    u[2] = _mm_unpacklo_epi16(in7, kZero);
 | 
|    u[3] = _mm_unpacklo_epi16(in[2], kZero);
 | 
| +  u[4] = _mm_unpacklo_epi16(in[3], kZero);
 | 
|  
 | 
|    v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
 | 
|    v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
 | 
| @@ -219,9 +217,10 @@
 | 
|    v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
 | 
|    v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
 | 
|    v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
 | 
| +  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
 | 
|  
 | 
|    u[0] = _mm_add_epi32(v[0], v[1]);
 | 
| -  u[1] = v[2];
 | 
| +  u[1] = _mm_sub_epi32(v[2], v[6]);
 | 
|    u[2] = _mm_add_epi32(v[3], v[4]);
 | 
|    u[3] = _mm_sub_epi32(u[2], u[0]);
 | 
|    u[4] = _mm_slli_epi32(v[5], 2);
 | 
| 
 |