| Index: source/libvpx/vp9/encoder/vp9_dct.c
 | 
| ===================================================================
 | 
| --- source/libvpx/vp9/encoder/vp9_dct.c	(revision 240950)
 | 
| +++ source/libvpx/vp9/encoder/vp9_dct.c	(working copy)
 | 
| @@ -20,6 +20,12 @@
 | 
|  
 | 
|  #include "vp9/encoder/vp9_dct.h"
 | 
|  
 | 
| +static INLINE int fdct_round_shift(int input) {  // rounds, then range-checks
 | 
| +  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
 | 
| +  assert(INT16_MIN <= rv && rv <= INT16_MAX);  // debug-only int16 range check
 | 
| +  return rv;
 | 
| +}
 | 
| +
 | 
|  static void fdct4(const int16_t *input, int16_t *output) {
 | 
|    int16_t step[4];
 | 
|    int temp1, temp2;
 | 
| @@ -31,12 +37,12 @@
 | 
|  
 | 
|    temp1 = (step[0] + step[1]) * cospi_16_64;
 | 
|    temp2 = (step[0] - step[1]) * cospi_16_64;
 | 
| -  output[0] = dct_const_round_shift(temp1);
 | 
| -  output[2] = dct_const_round_shift(temp2);
 | 
| +  output[0] = fdct_round_shift(temp1);
 | 
| +  output[2] = fdct_round_shift(temp2);
 | 
|    temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
 | 
|    temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
 | 
| -  output[1] = dct_const_round_shift(temp1);
 | 
| -  output[3] = dct_const_round_shift(temp2);
 | 
| +  output[1] = fdct_round_shift(temp1);
 | 
| +  output[3] = fdct_round_shift(temp2);
 | 
|  }
 | 
|  
 | 
|  void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
 | 
| @@ -80,12 +86,12 @@
 | 
|        step[3] = input[0] - input[3];
 | 
|        temp1 = (step[0] + step[1]) * cospi_16_64;
 | 
|        temp2 = (step[0] - step[1]) * cospi_16_64;
 | 
| -      out[0] = dct_const_round_shift(temp1);
 | 
| -      out[2] = dct_const_round_shift(temp2);
 | 
| +      out[0] = fdct_round_shift(temp1);
 | 
| +      out[2] = fdct_round_shift(temp2);
 | 
|        temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
 | 
|        temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
 | 
| -      out[1] = dct_const_round_shift(temp1);
 | 
| -      out[3] = dct_const_round_shift(temp2);
 | 
| +      out[1] = fdct_round_shift(temp1);
 | 
| +      out[3] = fdct_round_shift(temp2);
 | 
|        // Do next column (which is a transposed row in second/horizontal pass)
 | 
|        in++;
 | 
|        out += 4;
 | 
| @@ -138,10 +144,10 @@
 | 
|    s3 = x2 - x0 + x3;
 | 
|  
 | 
|    // 1-D transform scaling factor is sqrt(2).
 | 
| -  output[0] = dct_const_round_shift(s0);
 | 
| -  output[1] = dct_const_round_shift(s1);
 | 
| -  output[2] = dct_const_round_shift(s2);
 | 
| -  output[3] = dct_const_round_shift(s3);
 | 
| +  output[0] = fdct_round_shift(s0);
 | 
| +  output[1] = fdct_round_shift(s1);
 | 
| +  output[2] = fdct_round_shift(s2);
 | 
| +  output[3] = fdct_round_shift(s3);
 | 
|  }
 | 
|  
 | 
|  static const transform_2d FHT_4[] = {
 | 
| @@ -204,16 +210,16 @@
 | 
|    t1 = (x0 - x1) * cospi_16_64;
 | 
|    t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
 | 
|    t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
 | 
| -  output[0] = dct_const_round_shift(t0);
 | 
| -  output[2] = dct_const_round_shift(t2);
 | 
| -  output[4] = dct_const_round_shift(t1);
 | 
| -  output[6] = dct_const_round_shift(t3);
 | 
| +  output[0] = fdct_round_shift(t0);
 | 
| +  output[2] = fdct_round_shift(t2);
 | 
| +  output[4] = fdct_round_shift(t1);
 | 
| +  output[6] = fdct_round_shift(t3);
 | 
|  
 | 
|    // Stage 2
 | 
|    t0 = (s6 - s5) * cospi_16_64;
 | 
|    t1 = (s6 + s5) * cospi_16_64;
 | 
| -  t2 = dct_const_round_shift(t0);
 | 
| -  t3 = dct_const_round_shift(t1);
 | 
| +  t2 = fdct_round_shift(t0);
 | 
| +  t3 = fdct_round_shift(t1);
 | 
|  
 | 
|    // Stage 3
 | 
|    x0 = s4 + t2;
 | 
| @@ -226,10 +232,10 @@
 | 
|    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
 | 
|    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
 | 
|    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
 | 
| -  output[1] = dct_const_round_shift(t0);
 | 
| -  output[3] = dct_const_round_shift(t2);
 | 
| -  output[5] = dct_const_round_shift(t1);
 | 
| -  output[7] = dct_const_round_shift(t3);
 | 
| +  output[1] = fdct_round_shift(t0);
 | 
| +  output[3] = fdct_round_shift(t2);
 | 
| +  output[5] = fdct_round_shift(t1);
 | 
| +  output[7] = fdct_round_shift(t3);
 | 
|  }
 | 
|  
 | 
|  void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
 | 
| @@ -264,16 +270,16 @@
 | 
|        t1 = (x0 - x1) * cospi_16_64;
 | 
|        t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
 | 
|        t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
 | 
| -      output[0 * 8] = dct_const_round_shift(t0);
 | 
| -      output[2 * 8] = dct_const_round_shift(t2);
 | 
| -      output[4 * 8] = dct_const_round_shift(t1);
 | 
| -      output[6 * 8] = dct_const_round_shift(t3);
 | 
| +      output[0 * 8] = fdct_round_shift(t0);
 | 
| +      output[2 * 8] = fdct_round_shift(t2);
 | 
| +      output[4 * 8] = fdct_round_shift(t1);
 | 
| +      output[6 * 8] = fdct_round_shift(t3);
 | 
|  
 | 
|        // Stage 2
 | 
|        t0 = (s6 - s5) * cospi_16_64;
 | 
|        t1 = (s6 + s5) * cospi_16_64;
 | 
| -      t2 = dct_const_round_shift(t0);
 | 
| -      t3 = dct_const_round_shift(t1);
 | 
| +      t2 = fdct_round_shift(t0);
 | 
| +      t3 = fdct_round_shift(t1);
 | 
|  
 | 
|        // Stage 3
 | 
|        x0 = s4 + t2;
 | 
| @@ -286,10 +292,10 @@
 | 
|        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
 | 
|        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
 | 
|        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
 | 
| -      output[1 * 8] = dct_const_round_shift(t0);
 | 
| -      output[3 * 8] = dct_const_round_shift(t2);
 | 
| -      output[5 * 8] = dct_const_round_shift(t1);
 | 
| -      output[7 * 8] = dct_const_round_shift(t3);
 | 
| +      output[1 * 8] = fdct_round_shift(t0);
 | 
| +      output[3 * 8] = fdct_round_shift(t2);
 | 
| +      output[5 * 8] = fdct_round_shift(t1);
 | 
| +      output[7 * 8] = fdct_round_shift(t3);
 | 
|        input++;
 | 
|        output++;
 | 
|      }
 | 
| @@ -388,16 +394,16 @@
 | 
|          t1 = (x0 - x1) * cospi_16_64;
 | 
|          t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
 | 
|          t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
 | 
| -        out[0] = dct_const_round_shift(t0);
 | 
| -        out[4] = dct_const_round_shift(t2);
 | 
| -        out[8] = dct_const_round_shift(t1);
 | 
| -        out[12] = dct_const_round_shift(t3);
 | 
| +        out[0] = fdct_round_shift(t0);
 | 
| +        out[4] = fdct_round_shift(t2);
 | 
| +        out[8] = fdct_round_shift(t1);
 | 
| +        out[12] = fdct_round_shift(t3);
 | 
|  
 | 
|          // Stage 2
 | 
|          t0 = (s6 - s5) * cospi_16_64;
 | 
|          t1 = (s6 + s5) * cospi_16_64;
 | 
| -        t2 = dct_const_round_shift(t0);
 | 
| -        t3 = dct_const_round_shift(t1);
 | 
| +        t2 = fdct_round_shift(t0);
 | 
| +        t3 = fdct_round_shift(t1);
 | 
|  
 | 
|          // Stage 3
 | 
|          x0 = s4 + t2;
 | 
| @@ -410,22 +416,22 @@
 | 
|          t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
 | 
|          t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
 | 
|          t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
 | 
| -        out[2] = dct_const_round_shift(t0);
 | 
| -        out[6] = dct_const_round_shift(t2);
 | 
| -        out[10] = dct_const_round_shift(t1);
 | 
| -        out[14] = dct_const_round_shift(t3);
 | 
| +        out[2] = fdct_round_shift(t0);
 | 
| +        out[6] = fdct_round_shift(t2);
 | 
| +        out[10] = fdct_round_shift(t1);
 | 
| +        out[14] = fdct_round_shift(t3);
 | 
|        }
 | 
|        // Work on the next eight values; step1 -> odd_results
 | 
|        {
 | 
|          // step 2
 | 
|          temp1 = (step1[5] - step1[2]) * cospi_16_64;
 | 
|          temp2 = (step1[4] - step1[3]) * cospi_16_64;
 | 
| -        step2[2] = dct_const_round_shift(temp1);
 | 
| -        step2[3] = dct_const_round_shift(temp2);
 | 
| +        step2[2] = fdct_round_shift(temp1);
 | 
| +        step2[3] = fdct_round_shift(temp2);
 | 
|          temp1 = (step1[4] + step1[3]) * cospi_16_64;
 | 
|          temp2 = (step1[5] + step1[2]) * cospi_16_64;
 | 
| -        step2[4] = dct_const_round_shift(temp1);
 | 
| -        step2[5] = dct_const_round_shift(temp2);
 | 
| +        step2[4] = fdct_round_shift(temp1);
 | 
| +        step2[5] = fdct_round_shift(temp2);
 | 
|          // step 3
 | 
|          step3[0] = step1[0] + step2[3];
 | 
|          step3[1] = step1[1] + step2[2];
 | 
| @@ -438,12 +444,12 @@
 | 
|          // step 4
 | 
|          temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
 | 
|          temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
 | 
| -        step2[1] = dct_const_round_shift(temp1);
 | 
| -        step2[2] = dct_const_round_shift(temp2);
 | 
| +        step2[1] = fdct_round_shift(temp1);
 | 
| +        step2[2] = fdct_round_shift(temp2);
 | 
|          temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
 | 
|          temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
 | 
| -        step2[5] = dct_const_round_shift(temp1);
 | 
| -        step2[6] = dct_const_round_shift(temp2);
 | 
| +        step2[5] = fdct_round_shift(temp1);
 | 
| +        step2[6] = fdct_round_shift(temp2);
 | 
|          // step 5
 | 
|          step1[0] = step3[0] + step2[1];
 | 
|          step1[1] = step3[0] - step2[1];
 | 
| @@ -456,20 +462,20 @@
 | 
|          // step 6
 | 
|          temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
 | 
|          temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
 | 
| -        out[1] = dct_const_round_shift(temp1);
 | 
| -        out[9] = dct_const_round_shift(temp2);
 | 
| +        out[1] = fdct_round_shift(temp1);
 | 
| +        out[9] = fdct_round_shift(temp2);
 | 
|          temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
 | 
|          temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
 | 
| -        out[5] = dct_const_round_shift(temp1);
 | 
| -        out[13] = dct_const_round_shift(temp2);
 | 
| +        out[5] = fdct_round_shift(temp1);
 | 
| +        out[13] = fdct_round_shift(temp2);
 | 
|          temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
 | 
|          temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
 | 
| -        out[3] = dct_const_round_shift(temp1);
 | 
| -        out[11] = dct_const_round_shift(temp2);
 | 
| +        out[3] = fdct_round_shift(temp1);
 | 
| +        out[11] = fdct_round_shift(temp2);
 | 
|          temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
 | 
|          temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
 | 
| -        out[7] = dct_const_round_shift(temp1);
 | 
| -        out[15] = dct_const_round_shift(temp2);
 | 
| +        out[7] = fdct_round_shift(temp1);
 | 
| +        out[15] = fdct_round_shift(temp2);
 | 
|        }
 | 
|        // Do next column (which is a transposed row in second/horizontal pass)
 | 
|        in++;
 | 
| @@ -503,14 +509,14 @@
 | 
|    s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
 | 
|    s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 | 
|  
 | 
| -  x0 = dct_const_round_shift(s0 + s4);
 | 
| -  x1 = dct_const_round_shift(s1 + s5);
 | 
| -  x2 = dct_const_round_shift(s2 + s6);
 | 
| -  x3 = dct_const_round_shift(s3 + s7);
 | 
| -  x4 = dct_const_round_shift(s0 - s4);
 | 
| -  x5 = dct_const_round_shift(s1 - s5);
 | 
| -  x6 = dct_const_round_shift(s2 - s6);
 | 
| -  x7 = dct_const_round_shift(s3 - s7);
 | 
| +  x0 = fdct_round_shift(s0 + s4);
 | 
| +  x1 = fdct_round_shift(s1 + s5);
 | 
| +  x2 = fdct_round_shift(s2 + s6);
 | 
| +  x3 = fdct_round_shift(s3 + s7);
 | 
| +  x4 = fdct_round_shift(s0 - s4);
 | 
| +  x5 = fdct_round_shift(s1 - s5);
 | 
| +  x6 = fdct_round_shift(s2 - s6);
 | 
| +  x7 = fdct_round_shift(s3 - s7);
 | 
|  
 | 
|    // stage 2
 | 
|    s0 = x0;
 | 
| @@ -526,10 +532,10 @@
 | 
|    x1 = s1 + s3;
 | 
|    x2 = s0 - s2;
 | 
|    x3 = s1 - s3;
 | 
| -  x4 = dct_const_round_shift(s4 + s6);
 | 
| -  x5 = dct_const_round_shift(s5 + s7);
 | 
| -  x6 = dct_const_round_shift(s4 - s6);
 | 
| -  x7 = dct_const_round_shift(s5 - s7);
 | 
| +  x4 = fdct_round_shift(s4 + s6);
 | 
| +  x5 = fdct_round_shift(s5 + s7);
 | 
| +  x6 = fdct_round_shift(s4 - s6);
 | 
| +  x7 = fdct_round_shift(s5 - s7);
 | 
|  
 | 
|    // stage 3
 | 
|    s2 = cospi_16_64 * (x2 + x3);
 | 
| @@ -537,10 +543,10 @@
 | 
|    s6 = cospi_16_64 * (x6 + x7);
 | 
|    s7 = cospi_16_64 * (x6 - x7);
 | 
|  
 | 
| -  x2 = dct_const_round_shift(s2);
 | 
| -  x3 = dct_const_round_shift(s3);
 | 
| -  x6 = dct_const_round_shift(s6);
 | 
| -  x7 = dct_const_round_shift(s7);
 | 
| +  x2 = fdct_round_shift(s2);
 | 
| +  x3 = fdct_round_shift(s3);
 | 
| +  x6 = fdct_round_shift(s6);
 | 
| +  x7 = fdct_round_shift(s7);
 | 
|  
 | 
|    output[0] =   x0;
 | 
|    output[1] = - x4;
 | 
| @@ -693,16 +699,16 @@
 | 
|      t1 = (x0 - x1) * cospi_16_64;
 | 
|      t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
 | 
|      t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
 | 
| -    out[0] = dct_const_round_shift(t0);
 | 
| -    out[4] = dct_const_round_shift(t2);
 | 
| -    out[8] = dct_const_round_shift(t1);
 | 
| -    out[12] = dct_const_round_shift(t3);
 | 
| +    out[0] = fdct_round_shift(t0);
 | 
| +    out[4] = fdct_round_shift(t2);
 | 
| +    out[8] = fdct_round_shift(t1);
 | 
| +    out[12] = fdct_round_shift(t3);
 | 
|  
 | 
|      // Stage 2
 | 
|      t0 = (s6 - s5) * cospi_16_64;
 | 
|      t1 = (s6 + s5) * cospi_16_64;
 | 
| -    t2 = dct_const_round_shift(t0);
 | 
| -    t3 = dct_const_round_shift(t1);
 | 
| +    t2 = fdct_round_shift(t0);
 | 
| +    t3 = fdct_round_shift(t1);
 | 
|  
 | 
|      // Stage 3
 | 
|      x0 = s4 + t2;
 | 
| @@ -715,21 +721,21 @@
 | 
|      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
 | 
|      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
 | 
|      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
 | 
| -    out[2] = dct_const_round_shift(t0);
 | 
| -    out[6] = dct_const_round_shift(t2);
 | 
| -    out[10] = dct_const_round_shift(t1);
 | 
| -    out[14] = dct_const_round_shift(t3);
 | 
| +    out[2] = fdct_round_shift(t0);
 | 
| +    out[6] = fdct_round_shift(t2);
 | 
| +    out[10] = fdct_round_shift(t1);
 | 
| +    out[14] = fdct_round_shift(t3);
 | 
|    }
 | 
|  
 | 
|    // step 2
 | 
|    temp1 = (step1[5] - step1[2]) * cospi_16_64;
 | 
|    temp2 = (step1[4] - step1[3]) * cospi_16_64;
 | 
| -  step2[2] = dct_const_round_shift(temp1);
 | 
| -  step2[3] = dct_const_round_shift(temp2);
 | 
| +  step2[2] = fdct_round_shift(temp1);
 | 
| +  step2[3] = fdct_round_shift(temp2);
 | 
|    temp1 = (step1[4] + step1[3]) * cospi_16_64;
 | 
|    temp2 = (step1[5] + step1[2]) * cospi_16_64;
 | 
| -  step2[4] = dct_const_round_shift(temp1);
 | 
| -  step2[5] = dct_const_round_shift(temp2);
 | 
| +  step2[4] = fdct_round_shift(temp1);
 | 
| +  step2[5] = fdct_round_shift(temp2);
 | 
|  
 | 
|    // step 3
 | 
|    step3[0] = step1[0] + step2[3];
 | 
| @@ -744,12 +750,12 @@
 | 
|    // step 4
 | 
|    temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
 | 
|    temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
 | 
| -  step2[1] = dct_const_round_shift(temp1);
 | 
| -  step2[2] = dct_const_round_shift(temp2);
 | 
| +  step2[1] = fdct_round_shift(temp1);
 | 
| +  step2[2] = fdct_round_shift(temp2);
 | 
|    temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
 | 
|    temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
 | 
| -  step2[5] = dct_const_round_shift(temp1);
 | 
| -  step2[6] = dct_const_round_shift(temp2);
 | 
| +  step2[5] = fdct_round_shift(temp1);
 | 
| +  step2[6] = fdct_round_shift(temp2);
 | 
|  
 | 
|    // step 5
 | 
|    step1[0] = step3[0] + step2[1];
 | 
| @@ -764,23 +770,23 @@
 | 
|    // step 6
 | 
|    temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
 | 
|    temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
 | 
| -  out[1] = dct_const_round_shift(temp1);
 | 
| -  out[9] = dct_const_round_shift(temp2);
 | 
| +  out[1] = fdct_round_shift(temp1);
 | 
| +  out[9] = fdct_round_shift(temp2);
 | 
|  
 | 
|    temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
 | 
|    temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
 | 
| -  out[5] = dct_const_round_shift(temp1);
 | 
| -  out[13] = dct_const_round_shift(temp2);
 | 
| +  out[5] = fdct_round_shift(temp1);
 | 
| +  out[13] = fdct_round_shift(temp2);
 | 
|  
 | 
|    temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
 | 
|    temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
 | 
| -  out[3] = dct_const_round_shift(temp1);
 | 
| -  out[11] = dct_const_round_shift(temp2);
 | 
| +  out[3] = fdct_round_shift(temp1);
 | 
| +  out[11] = fdct_round_shift(temp2);
 | 
|  
 | 
|    temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
 | 
|    temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
 | 
| -  out[7] = dct_const_round_shift(temp1);
 | 
| -  out[15] = dct_const_round_shift(temp2);
 | 
| +  out[7] = fdct_round_shift(temp1);
 | 
| +  out[15] = fdct_round_shift(temp2);
 | 
|  }
 | 
|  
 | 
|  static void fadst16(const int16_t *input, int16_t *output) {
 | 
| @@ -821,22 +827,22 @@
 | 
|    s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
 | 
|    s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 | 
|  
 | 
| -  x0 = dct_const_round_shift(s0 + s8);
 | 
| -  x1 = dct_const_round_shift(s1 + s9);
 | 
| -  x2 = dct_const_round_shift(s2 + s10);
 | 
| -  x3 = dct_const_round_shift(s3 + s11);
 | 
| -  x4 = dct_const_round_shift(s4 + s12);
 | 
| -  x5 = dct_const_round_shift(s5 + s13);
 | 
| -  x6 = dct_const_round_shift(s6 + s14);
 | 
| -  x7 = dct_const_round_shift(s7 + s15);
 | 
| -  x8  = dct_const_round_shift(s0 - s8);
 | 
| -  x9  = dct_const_round_shift(s1 - s9);
 | 
| -  x10 = dct_const_round_shift(s2 - s10);
 | 
| -  x11 = dct_const_round_shift(s3 - s11);
 | 
| -  x12 = dct_const_round_shift(s4 - s12);
 | 
| -  x13 = dct_const_round_shift(s5 - s13);
 | 
| -  x14 = dct_const_round_shift(s6 - s14);
 | 
| -  x15 = dct_const_round_shift(s7 - s15);
 | 
| +  x0 = fdct_round_shift(s0 + s8);
 | 
| +  x1 = fdct_round_shift(s1 + s9);
 | 
| +  x2 = fdct_round_shift(s2 + s10);
 | 
| +  x3 = fdct_round_shift(s3 + s11);
 | 
| +  x4 = fdct_round_shift(s4 + s12);
 | 
| +  x5 = fdct_round_shift(s5 + s13);
 | 
| +  x6 = fdct_round_shift(s6 + s14);
 | 
| +  x7 = fdct_round_shift(s7 + s15);
 | 
| +  x8  = fdct_round_shift(s0 - s8);
 | 
| +  x9  = fdct_round_shift(s1 - s9);
 | 
| +  x10 = fdct_round_shift(s2 - s10);
 | 
| +  x11 = fdct_round_shift(s3 - s11);
 | 
| +  x12 = fdct_round_shift(s4 - s12);
 | 
| +  x13 = fdct_round_shift(s5 - s13);
 | 
| +  x14 = fdct_round_shift(s6 - s14);
 | 
| +  x15 = fdct_round_shift(s7 - s15);
 | 
|  
 | 
|    // stage 2
 | 
|    s0 = x0;
 | 
| @@ -864,14 +870,14 @@
 | 
|    x5 = s1 - s5;
 | 
|    x6 = s2 - s6;
 | 
|    x7 = s3 - s7;
 | 
| -  x8 = dct_const_round_shift(s8 + s12);
 | 
| -  x9 = dct_const_round_shift(s9 + s13);
 | 
| -  x10 = dct_const_round_shift(s10 + s14);
 | 
| -  x11 = dct_const_round_shift(s11 + s15);
 | 
| -  x12 = dct_const_round_shift(s8 - s12);
 | 
| -  x13 = dct_const_round_shift(s9 - s13);
 | 
| -  x14 = dct_const_round_shift(s10 - s14);
 | 
| -  x15 = dct_const_round_shift(s11 - s15);
 | 
| +  x8 = fdct_round_shift(s8 + s12);
 | 
| +  x9 = fdct_round_shift(s9 + s13);
 | 
| +  x10 = fdct_round_shift(s10 + s14);
 | 
| +  x11 = fdct_round_shift(s11 + s15);
 | 
| +  x12 = fdct_round_shift(s8 - s12);
 | 
| +  x13 = fdct_round_shift(s9 - s13);
 | 
| +  x14 = fdct_round_shift(s10 - s14);
 | 
| +  x15 = fdct_round_shift(s11 - s15);
 | 
|  
 | 
|    // stage 3
 | 
|    s0 = x0;
 | 
| @@ -895,18 +901,18 @@
 | 
|    x1 = s1 + s3;
 | 
|    x2 = s0 - s2;
 | 
|    x3 = s1 - s3;
 | 
| -  x4 = dct_const_round_shift(s4 + s6);
 | 
| -  x5 = dct_const_round_shift(s5 + s7);
 | 
| -  x6 = dct_const_round_shift(s4 - s6);
 | 
| -  x7 = dct_const_round_shift(s5 - s7);
 | 
| +  x4 = fdct_round_shift(s4 + s6);
 | 
| +  x5 = fdct_round_shift(s5 + s7);
 | 
| +  x6 = fdct_round_shift(s4 - s6);
 | 
| +  x7 = fdct_round_shift(s5 - s7);
 | 
|    x8 = s8 + s10;
 | 
|    x9 = s9 + s11;
 | 
|    x10 = s8 - s10;
 | 
|    x11 = s9 - s11;
 | 
| -  x12 = dct_const_round_shift(s12 + s14);
 | 
| -  x13 = dct_const_round_shift(s13 + s15);
 | 
| -  x14 = dct_const_round_shift(s12 - s14);
 | 
| -  x15 = dct_const_round_shift(s13 - s15);
 | 
| +  x12 = fdct_round_shift(s12 + s14);
 | 
| +  x13 = fdct_round_shift(s13 + s15);
 | 
| +  x14 = fdct_round_shift(s12 - s14);
 | 
| +  x15 = fdct_round_shift(s13 - s15);
 | 
|  
 | 
|    // stage 4
 | 
|    s2 = (- cospi_16_64) * (x2 + x3);
 | 
| @@ -918,14 +924,14 @@
 | 
|    s14 = (- cospi_16_64) * (x14 + x15);
 | 
|    s15 = cospi_16_64 * (x14 - x15);
 | 
|  
 | 
| -  x2 = dct_const_round_shift(s2);
 | 
| -  x3 = dct_const_round_shift(s3);
 | 
| -  x6 = dct_const_round_shift(s6);
 | 
| -  x7 = dct_const_round_shift(s7);
 | 
| -  x10 = dct_const_round_shift(s10);
 | 
| -  x11 = dct_const_round_shift(s11);
 | 
| -  x14 = dct_const_round_shift(s14);
 | 
| -  x15 = dct_const_round_shift(s15);
 | 
| +  x2 = fdct_round_shift(s2);
 | 
| +  x3 = fdct_round_shift(s3);
 | 
| +  x6 = fdct_round_shift(s6);
 | 
| +  x7 = fdct_round_shift(s7);
 | 
| +  x10 = fdct_round_shift(s10);
 | 
| +  x11 = fdct_round_shift(s11);
 | 
| +  x14 = fdct_round_shift(s14);
 | 
| +  x15 = fdct_round_shift(s15);
 | 
|  
 | 
|    output[0] = x0;
 | 
|    output[1] = - x8;
 | 
| 
 |