| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3  * | 3  * | 
| 4  *  Use of this source code is governed by a BSD-style license | 4  *  Use of this source code is governed by a BSD-style license | 
| 5  *  that can be found in the LICENSE file in the root of the source | 5  *  that can be found in the LICENSE file in the root of the source | 
| 6  *  tree. An additional intellectual property rights grant can be found | 6  *  tree. An additional intellectual property rights grant can be found | 
| 7  *  in the file PATENTS.  All contributing project authors may | 7  *  in the file PATENTS.  All contributing project authors may | 
| 8  *  be found in the AUTHORS file in the root of the source tree. | 8  *  be found in the AUTHORS file in the root of the source tree. | 
| 9  */ | 9  */ | 
| 10 | 10 | 
| 11 #include <assert.h> | 11 #include <assert.h> | 
| 12 #include <math.h> | 12 #include <math.h> | 
| 13 | 13 | 
| 14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" | 
| 15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" | 
| 16 | 16 | 
| 17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" | 
| 18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" | 
| 19 #include "vp9/common/vp9_systemdependent.h" | 19 #include "vp9/common/vp9_systemdependent.h" | 
| 20 | 20 | 
| 21 static INLINE int fdct_round_shift(int input) { | 21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) { | 
| 22   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 22   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 
| 23   assert(INT16_MIN <= rv && rv <= INT16_MAX); | 23   // TODO(debargha, peter.derivaz): Find new bounds for this assert | 
|  | 24   // and make the bounds consts. | 
|  | 25   // assert(INT16_MIN <= rv && rv <= INT16_MAX); | 
| 24   return rv; | 26   return rv; | 
| 25 } | 27 } | 
| 26 | 28 | 
| 27 static void fdct4(const int16_t *input, int16_t *output) { | 29 static void fdct4(const tran_low_t *input, tran_low_t *output) { | 
| 28   int16_t step[4]; | 30   tran_high_t step[4]; | 
| 29   int temp1, temp2; | 31   tran_high_t temp1, temp2; | 
| 30 | 32 | 
| 31   step[0] = input[0] + input[3]; | 33   step[0] = input[0] + input[3]; | 
| 32   step[1] = input[1] + input[2]; | 34   step[1] = input[1] + input[2]; | 
| 33   step[2] = input[1] - input[2]; | 35   step[2] = input[1] - input[2]; | 
| 34   step[3] = input[0] - input[3]; | 36   step[3] = input[0] - input[3]; | 
| 35 | 37 | 
| 36   temp1 = (step[0] + step[1]) * cospi_16_64; | 38   temp1 = (step[0] + step[1]) * cospi_16_64; | 
| 37   temp2 = (step[0] - step[1]) * cospi_16_64; | 39   temp2 = (step[0] - step[1]) * cospi_16_64; | 
| 38   output[0] = fdct_round_shift(temp1); | 40   output[0] = fdct_round_shift(temp1); | 
| 39   output[2] = fdct_round_shift(temp2); | 41   output[2] = fdct_round_shift(temp2); | 
| 40   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 42   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 
| 41   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 43   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 
| 42   output[1] = fdct_round_shift(temp1); | 44   output[1] = fdct_round_shift(temp1); | 
| 43   output[3] = fdct_round_shift(temp2); | 45   output[3] = fdct_round_shift(temp2); | 
| 44 } | 46 } | 
| 45 | 47 | 
| 46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { | 48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 47   int r, c; | 49   int r, c; | 
| 48   int16_t sum = 0; | 50   tran_low_t sum = 0; | 
| 49   for (r = 0; r < 4; ++r) | 51   for (r = 0; r < 4; ++r) | 
| 50     for (c = 0; c < 4; ++c) | 52     for (c = 0; c < 4; ++c) | 
| 51       sum += input[r * stride + c]; | 53       sum += input[r * stride + c]; | 
| 52 | 54 | 
| 53   output[0] = sum << 1; | 55   output[0] = sum << 1; | 
| 54   output[1] = 0; | 56   output[1] = 0; | 
| 55 } | 57 } | 
| 56 | 58 | 
| 57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { | 59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 58   // The 2D transform is done with two passes which are actually pretty | 60   // The 2D transform is done with two passes which are actually pretty | 
| 59   // similar. In the first one, we transform the columns and transpose | 61   // similar. In the first one, we transform the columns and transpose | 
| 60   // the results. In the second one, we transform the rows. To achieve that, | 62   // the results. In the second one, we transform the rows. To achieve that, | 
| 61   // as the first pass results are transposed, we transpose the columns (that | 63   // as the first pass results are transposed, we transpose the columns (that | 
| 62   // is the transposed rows) and transpose the results (so that it goes back | 64   // is the transposed rows) and transpose the results (so that it goes back | 
| 63   // in normal/row positions). | 65   // in normal/row positions). | 
| 64   int pass; | 66   int pass; | 
| 65   // We need an intermediate buffer between passes. | 67   // We need an intermediate buffer between passes. | 
| 66   int16_t intermediate[4 * 4]; | 68   tran_low_t intermediate[4 * 4]; | 
| 67   const int16_t *in = input; | 69   const int16_t *in_pass0 = input; | 
| 68   int16_t *out = intermediate; | 70   const tran_low_t *in = NULL; | 
|  | 71   tran_low_t *out = intermediate; | 
| 69   // Do the two transform/transpose passes | 72   // Do the two transform/transpose passes | 
| 70   for (pass = 0; pass < 2; ++pass) { | 73   for (pass = 0; pass < 2; ++pass) { | 
| 71     /*canbe16*/ int input[4]; | 74     tran_high_t input[4];      // canbe16 | 
| 72     /*canbe16*/ int step[4]; | 75     tran_high_t step[4];       // canbe16 | 
| 73     /*needs32*/ int temp1, temp2; | 76     tran_high_t temp1, temp2;  // needs32 | 
| 74     int i; | 77     int i; | 
| 75     for (i = 0; i < 4; ++i) { | 78     for (i = 0; i < 4; ++i) { | 
| 76       // Load inputs. | 79       // Load inputs. | 
| 77       if (0 == pass) { | 80       if (0 == pass) { | 
| 78         input[0] = in[0 * stride] * 16; | 81         input[0] = in_pass0[0 * stride] * 16; | 
| 79         input[1] = in[1 * stride] * 16; | 82         input[1] = in_pass0[1 * stride] * 16; | 
| 80         input[2] = in[2 * stride] * 16; | 83         input[2] = in_pass0[2 * stride] * 16; | 
| 81         input[3] = in[3 * stride] * 16; | 84         input[3] = in_pass0[3 * stride] * 16; | 
| 82         if (i == 0 && input[0]) { | 85         if (i == 0 && input[0]) { | 
| 83           input[0] += 1; | 86           input[0] += 1; | 
| 84         } | 87         } | 
| 85       } else { | 88       } else { | 
| 86         input[0] = in[0 * 4]; | 89         input[0] = in[0 * 4]; | 
| 87         input[1] = in[1 * 4]; | 90         input[1] = in[1 * 4]; | 
| 88         input[2] = in[2 * 4]; | 91         input[2] = in[2 * 4]; | 
| 89         input[3] = in[3 * 4]; | 92         input[3] = in[3 * 4]; | 
| 90       } | 93       } | 
| 91       // Transform. | 94       // Transform. | 
| 92       step[0] = input[0] + input[3]; | 95       step[0] = input[0] + input[3]; | 
| 93       step[1] = input[1] + input[2]; | 96       step[1] = input[1] + input[2]; | 
| 94       step[2] = input[1] - input[2]; | 97       step[2] = input[1] - input[2]; | 
| 95       step[3] = input[0] - input[3]; | 98       step[3] = input[0] - input[3]; | 
| 96       temp1 = (step[0] + step[1]) * cospi_16_64; | 99       temp1 = (step[0] + step[1]) * cospi_16_64; | 
| 97       temp2 = (step[0] - step[1]) * cospi_16_64; | 100       temp2 = (step[0] - step[1]) * cospi_16_64; | 
| 98       out[0] = fdct_round_shift(temp1); | 101       out[0] = fdct_round_shift(temp1); | 
| 99       out[2] = fdct_round_shift(temp2); | 102       out[2] = fdct_round_shift(temp2); | 
| 100       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 103       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 
| 101       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 104       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 
| 102       out[1] = fdct_round_shift(temp1); | 105       out[1] = fdct_round_shift(temp1); | 
| 103       out[3] = fdct_round_shift(temp2); | 106       out[3] = fdct_round_shift(temp2); | 
| 104       // Do next column (which is a transposed row in second/horizontal pass) | 107       // Do next column (which is a transposed row in second/horizontal pass) | 
|  | 108       in_pass0++; | 
| 105       in++; | 109       in++; | 
| 106       out += 4; | 110       out += 4; | 
| 107     } | 111     } | 
| 108     // Setup in/out for next pass. | 112     // Setup in/out for next pass. | 
| 109     in = intermediate; | 113     in = intermediate; | 
| 110     out = output; | 114     out = output; | 
| 111   } | 115   } | 
| 112 | 116 | 
| 113   { | 117   { | 
| 114     int i, j; | 118     int i, j; | 
| 115     for (i = 0; i < 4; ++i) { | 119     for (i = 0; i < 4; ++i) { | 
| 116       for (j = 0; j < 4; ++j) | 120       for (j = 0; j < 4; ++j) | 
| 117         output[j + i * 4] = (output[j + i * 4] + 1) >> 2; | 121         output[j + i * 4] = (output[j + i * 4] + 1) >> 2; | 
| 118     } | 122     } | 
| 119   } | 123   } | 
| 120 } | 124 } | 
| 121 | 125 | 
| 122 static void fadst4(const int16_t *input, int16_t *output) { | 126 static void fadst4(const tran_low_t *input, tran_low_t *output) { | 
| 123   int x0, x1, x2, x3; | 127   tran_high_t x0, x1, x2, x3; | 
| 124   int s0, s1, s2, s3, s4, s5, s6, s7; | 128   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; | 
| 125 | 129 | 
| 126   x0 = input[0]; | 130   x0 = input[0]; | 
| 127   x1 = input[1]; | 131   x1 = input[1]; | 
| 128   x2 = input[2]; | 132   x2 = input[2]; | 
| 129   x3 = input[3]; | 133   x3 = input[3]; | 
| 130 | 134 | 
| 131   if (!(x0 | x1 | x2 | x3)) { | 135   if (!(x0 | x1 | x2 | x3)) { | 
| 132     output[0] = output[1] = output[2] = output[3] = 0; | 136     output[0] = output[1] = output[2] = output[3] = 0; | 
| 133     return; | 137     return; | 
| 134   } | 138   } | 
| (...skipping 24 matching lines...) | 
| 159   output[3] = fdct_round_shift(s3); | 163   output[3] = fdct_round_shift(s3); | 
| 160 } | 164 } | 
| 161 | 165 | 
| 162 static const transform_2d FHT_4[] = { | 166 static const transform_2d FHT_4[] = { | 
| 163   { fdct4,  fdct4  },  // DCT_DCT  = 0 | 167   { fdct4,  fdct4  },  // DCT_DCT  = 0 | 
| 164   { fadst4, fdct4  },  // ADST_DCT = 1 | 168   { fadst4, fdct4  },  // ADST_DCT = 1 | 
| 165   { fdct4,  fadst4 },  // DCT_ADST = 2 | 169   { fdct4,  fadst4 },  // DCT_ADST = 2 | 
| 166   { fadst4, fadst4 }   // ADST_ADST = 3 | 170   { fadst4, fadst4 }   // ADST_ADST = 3 | 
| 167 }; | 171 }; | 
| 168 | 172 | 
| 169 void vp9_fht4x4_c(const int16_t *input, int16_t *output, | 173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, | 
| 170                   int stride, int tx_type) { | 174                   int stride, int tx_type) { | 
| 171   if (tx_type == DCT_DCT) { | 175   if (tx_type == DCT_DCT) { | 
| 172     vp9_fdct4x4_c(input, output, stride); | 176     vp9_fdct4x4_c(input, output, stride); | 
| 173   } else { | 177   } else { | 
| 174     int16_t out[4 * 4]; | 178     tran_low_t out[4 * 4]; | 
| 175     int16_t *outptr = &out[0]; | 179     tran_low_t *outptr = &out[0]; | 
| 176     int i, j; | 180     int i, j; | 
| 177     int16_t temp_in[4], temp_out[4]; | 181     tran_low_t temp_in[4], temp_out[4]; | 
| 178     const transform_2d ht = FHT_4[tx_type]; | 182     const transform_2d ht = FHT_4[tx_type]; | 
| 179 | 183 | 
| 180     // Columns | 184     // Columns | 
| 181     for (i = 0; i < 4; ++i) { | 185     for (i = 0; i < 4; ++i) { | 
| 182       for (j = 0; j < 4; ++j) | 186       for (j = 0; j < 4; ++j) | 
| 183         temp_in[j] = input[j * stride + i] * 16; | 187         temp_in[j] = input[j * stride + i] * 16; | 
| 184       if (i == 0 && temp_in[0]) | 188       if (i == 0 && temp_in[0]) | 
| 185         temp_in[0] += 1; | 189         temp_in[0] += 1; | 
| 186       ht.cols(temp_in, temp_out); | 190       ht.cols(temp_in, temp_out); | 
| 187       for (j = 0; j < 4; ++j) | 191       for (j = 0; j < 4; ++j) | 
| 188         outptr[j * 4 + i] = temp_out[j]; | 192         outptr[j * 4 + i] = temp_out[j]; | 
| 189     } | 193     } | 
| 190 | 194 | 
| 191     // Rows | 195     // Rows | 
| 192     for (i = 0; i < 4; ++i) { | 196     for (i = 0; i < 4; ++i) { | 
| 193       for (j = 0; j < 4; ++j) | 197       for (j = 0; j < 4; ++j) | 
| 194         temp_in[j] = out[j + i * 4]; | 198         temp_in[j] = out[j + i * 4]; | 
| 195       ht.rows(temp_in, temp_out); | 199       ht.rows(temp_in, temp_out); | 
| 196       for (j = 0; j < 4; ++j) | 200       for (j = 0; j < 4; ++j) | 
| 197         output[j + i * 4] = (temp_out[j] + 1) >> 2; | 201         output[j + i * 4] = (temp_out[j] + 1) >> 2; | 
| 198     } | 202     } | 
| 199   } | 203   } | 
| 200 } | 204 } | 
| 201 | 205 | 
| 202 static void fdct8(const int16_t *input, int16_t *output) { | 206 static void fdct8(const tran_low_t *input, tran_low_t *output) { | 
| 203   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 207   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16 | 
| 204   /*needs32*/ int t0, t1, t2, t3; | 208   tran_high_t t0, t1, t2, t3;                  // needs32 | 
| 205   /*canbe16*/ int x0, x1, x2, x3; | 209   tran_high_t x0, x1, x2, x3;                  // canbe16 | 
| 206 | 210 | 
| 207   // stage 1 | 211   // stage 1 | 
| 208   s0 = input[0] + input[7]; | 212   s0 = input[0] + input[7]; | 
| 209   s1 = input[1] + input[6]; | 213   s1 = input[1] + input[6]; | 
| 210   s2 = input[2] + input[5]; | 214   s2 = input[2] + input[5]; | 
| 211   s3 = input[3] + input[4]; | 215   s3 = input[3] + input[4]; | 
| 212   s4 = input[3] - input[4]; | 216   s4 = input[3] - input[4]; | 
| 213   s5 = input[2] - input[5]; | 217   s5 = input[2] - input[5]; | 
| 214   s6 = input[1] - input[6]; | 218   s6 = input[1] - input[6]; | 
| 215   s7 = input[0] - input[7]; | 219   s7 = input[0] - input[7]; | 
| (...skipping 28 matching lines...) | 
| 244   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64; | 248   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64; | 
| 245   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64; | 249   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64; | 
| 246   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; | 250   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; | 
| 247   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64; | 251   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64; | 
| 248   output[1] = fdct_round_shift(t0); | 252   output[1] = fdct_round_shift(t0); | 
| 249   output[3] = fdct_round_shift(t2); | 253   output[3] = fdct_round_shift(t2); | 
| 250   output[5] = fdct_round_shift(t1); | 254   output[5] = fdct_round_shift(t1); | 
| 251   output[7] = fdct_round_shift(t3); | 255   output[7] = fdct_round_shift(t3); | 
| 252 } | 256 } | 
| 253 | 257 | 
| 254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { | 258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 255   int r, c; | 259   int r, c; | 
| 256   int16_t sum = 0; | 260   tran_low_t sum = 0; | 
| 257   for (r = 0; r < 8; ++r) | 261   for (r = 0; r < 8; ++r) | 
| 258     for (c = 0; c < 8; ++c) | 262     for (c = 0; c < 8; ++c) | 
| 259       sum += input[r * stride + c]; | 263       sum += input[r * stride + c]; | 
| 260 | 264 | 
| 261   output[0] = sum; | 265   output[0] = sum; | 
| 262   output[1] = 0; | 266   output[1] = 0; | 
| 263 } | 267 } | 
| 264 | 268 | 
| 265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { | 269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { | 
| 266   int i, j; | 270   int i, j; | 
| 267   int16_t intermediate[64]; | 271   tran_low_t intermediate[64]; | 
| 268 | 272 | 
| 269   // Transform columns | 273   // Transform columns | 
| 270   { | 274   { | 
| 271     int16_t *output = intermediate; | 275     tran_low_t *output = intermediate; | 
| 272     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 276     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16 | 
| 273     /*needs32*/ int t0, t1, t2, t3; | 277     tran_high_t t0, t1, t2, t3;                  // needs32 | 
| 274     /*canbe16*/ int x0, x1, x2, x3; | 278     tran_high_t x0, x1, x2, x3;                  // canbe16 | 
| 275 | 279 | 
| 276     int i; | 280     int i; | 
| 277     for (i = 0; i < 8; i++) { | 281     for (i = 0; i < 8; i++) { | 
| 278       // stage 1 | 282       // stage 1 | 
| 279       s0 = (input[0 * stride] + input[7 * stride]) * 4; | 283       s0 = (input[0 * stride] + input[7 * stride]) * 4; | 
| 280       s1 = (input[1 * stride] + input[6 * stride]) * 4; | 284       s1 = (input[1 * stride] + input[6 * stride]) * 4; | 
| 281       s2 = (input[2 * stride] + input[5 * stride]) * 4; | 285       s2 = (input[2 * stride] + input[5 * stride]) * 4; | 
| 282       s3 = (input[3 * stride] + input[4 * stride]) * 4; | 286       s3 = (input[3 * stride] + input[4 * stride]) * 4; | 
| 283       s4 = (input[3 * stride] - input[4 * stride]) * 4; | 287       s4 = (input[3 * stride] - input[4 * stride]) * 4; | 
| 284       s5 = (input[2 * stride] - input[5 * stride]) * 4; | 288       s5 = (input[2 * stride] - input[5 * stride]) * 4; | 
| (...skipping 41 matching lines...) | 
| 326   } | 330   } | 
| 327 | 331 | 
| 328   // Rows | 332   // Rows | 
| 329   for (i = 0; i < 8; ++i) { | 333   for (i = 0; i < 8; ++i) { | 
| 330     fdct8(&intermediate[i * 8], &final_output[i * 8]); | 334     fdct8(&intermediate[i * 8], &final_output[i * 8]); | 
| 331     for (j = 0; j < 8; ++j) | 335     for (j = 0; j < 8; ++j) | 
| 332       final_output[j + i * 8] /= 2; | 336       final_output[j + i * 8] /= 2; | 
| 333   } | 337   } | 
| 334 } | 338 } | 
| 335 | 339 | 
| 336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { | 340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 337   int r, c; | 341   int r, c; | 
| 338   int16_t sum = 0; | 342   tran_low_t sum = 0; | 
| 339   for (r = 0; r < 16; ++r) | 343   for (r = 0; r < 16; ++r) | 
| 340     for (c = 0; c < 16; ++c) | 344     for (c = 0; c < 16; ++c) | 
| 341       sum += input[r * stride + c]; | 345       sum += input[r * stride + c]; | 
| 342 | 346 | 
| 343   output[0] = sum >> 1; | 347   output[0] = sum >> 1; | 
| 344   output[1] = 0; | 348   output[1] = 0; | 
| 345 } | 349 } | 
| 346 | 350 | 
| 347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { | 351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 348   // The 2D transform is done with two passes which are actually pretty | 352   // The 2D transform is done with two passes which are actually pretty | 
| 349   // similar. In the first one, we transform the columns and transpose | 353   // similar. In the first one, we transform the columns and transpose | 
| 350   // the results. In the second one, we transform the rows. To achieve that, | 354   // the results. In the second one, we transform the rows. To achieve that, | 
| 351   // as the first pass results are transposed, we transpose the columns (that | 355   // as the first pass results are transposed, we transpose the columns (that | 
| 352   // is the transposed rows) and transpose the results (so that it goes back | 356   // is the transposed rows) and transpose the results (so that it goes back | 
| 353   // in normal/row positions). | 357   // in normal/row positions). | 
| 354   int pass; | 358   int pass; | 
| 355   // We need an intermediate buffer between passes. | 359   // We need an intermediate buffer between passes. | 
| 356   int16_t intermediate[256]; | 360   tran_low_t intermediate[256]; | 
| 357   const int16_t *in = input; | 361   const int16_t *in_pass0 = input; | 
| 358   int16_t *out = intermediate; | 362   const tran_low_t *in = NULL; | 
|  | 363   tran_low_t *out = intermediate; | 
| 359   // Do the two transform/transpose passes | 364   // Do the two transform/transpose passes | 
| 360   for (pass = 0; pass < 2; ++pass) { | 365   for (pass = 0; pass < 2; ++pass) { | 
| 361     /*canbe16*/ int step1[8]; | 366     tran_high_t step1[8];      // canbe16 | 
| 362     /*canbe16*/ int step2[8]; | 367     tran_high_t step2[8];      // canbe16 | 
| 363     /*canbe16*/ int step3[8]; | 368     tran_high_t step3[8];      // canbe16 | 
| 364     /*canbe16*/ int input[8]; | 369     tran_high_t input[8];      // canbe16 | 
| 365     /*needs32*/ int temp1, temp2; | 370     tran_high_t temp1, temp2;  // needs32 | 
| 366     int i; | 371     int i; | 
| 367     for (i = 0; i < 16; i++) { | 372     for (i = 0; i < 16; i++) { | 
| 368       if (0 == pass) { | 373       if (0 == pass) { | 
| 369         // Calculate input for the first 8 results. | 374         // Calculate input for the first 8 results. | 
| 370         input[0] = (in[0 * stride] + in[15 * stride]) * 4; | 375         input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; | 
| 371         input[1] = (in[1 * stride] + in[14 * stride]) * 4; | 376         input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; | 
| 372         input[2] = (in[2 * stride] + in[13 * stride]) * 4; | 377         input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; | 
| 373         input[3] = (in[3 * stride] + in[12 * stride]) * 4; | 378         input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; | 
| 374         input[4] = (in[4 * stride] + in[11 * stride]) * 4; | 379         input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; | 
| 375         input[5] = (in[5 * stride] + in[10 * stride]) * 4; | 380         input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; | 
| 376         input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; | 381         input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; | 
| 377         input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; | 382         input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; | 
| 378         // Calculate input for the next 8 results. | 383         // Calculate input for the next 8 results. | 
| 379         step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; | 384         step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; | 
| 380         step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; | 385         step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; | 
| 381         step1[2] = (in[5 * stride] - in[10 * stride]) * 4; | 386         step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; | 
| 382         step1[3] = (in[4 * stride] - in[11 * stride]) * 4; | 387         step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; | 
| 383         step1[4] = (in[3 * stride] - in[12 * stride]) * 4; | 388         step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; | 
| 384         step1[5] = (in[2 * stride] - in[13 * stride]) * 4; | 389         step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; | 
| 385         step1[6] = (in[1 * stride] - in[14 * stride]) * 4; | 390         step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; | 
| 386         step1[7] = (in[0 * stride] - in[15 * stride]) * 4; | 391         step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; | 
| 387       } else { | 392       } else { | 
| 388         // Calculate input for the first 8 results. | 393         // Calculate input for the first 8 results. | 
| 389         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); | 394         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); | 
| 390         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); | 395         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); | 
| 391         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); | 396         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); | 
| 392         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); | 397         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); | 
| 393         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); | 398         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); | 
| 394         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); | 399         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); | 
| 395         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); | 400         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); | 
| 396         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); | 401         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); | 
| 397         // Calculate input for the next 8 results. | 402         // Calculate input for the next 8 results. | 
| 398         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); | 403         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); | 
| 399         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); | 404         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); | 
| 400         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); | 405         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); | 
| 401         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); | 406         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); | 
| 402         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); | 407         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); | 
| 403         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); | 408         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); | 
| 404         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); | 409         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); | 
| 405         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); | 410         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); | 
| 406       } | 411       } | 
| 407       // Work on the first eight values; fdct8(input, even_results); | 412       // Work on the first eight values; fdct8(input, even_results); | 
| 408       { | 413       { | 
| 409         /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 414         tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16 | 
| 410         /*needs32*/ int t0, t1, t2, t3; | 415         tran_high_t t0, t1, t2, t3;                  // needs32 | 
| 411         /*canbe16*/ int x0, x1, x2, x3; | 416         tran_high_t x0, x1, x2, x3;                  // canbe16 | 
| 412 | 417 | 
| 413         // stage 1 | 418         // stage 1 | 
| 414         s0 = input[0] + input[7]; | 419         s0 = input[0] + input[7]; | 
| 415         s1 = input[1] + input[6]; | 420         s1 = input[1] + input[6]; | 
| 416         s2 = input[2] + input[5]; | 421         s2 = input[2] + input[5]; | 
| 417         s3 = input[3] + input[4]; | 422         s3 = input[3] + input[4]; | 
| 418         s4 = input[3] - input[4]; | 423         s4 = input[3] - input[4]; | 
| 419         s5 = input[2] - input[5]; | 424         s5 = input[2] - input[5]; | 
| 420         s6 = input[1] - input[6]; | 425         s6 = input[1] - input[6]; | 
| 421         s7 = input[0] - input[7]; | 426         s7 = input[0] - input[7]; | 
| (...skipping 85 matching lines...) | 
| 507         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 512         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 
| 508         out[3] = fdct_round_shift(temp1); | 513         out[3] = fdct_round_shift(temp1); | 
| 509         out[11] = fdct_round_shift(temp2); | 514         out[11] = fdct_round_shift(temp2); | 
| 510         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 515         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 
| 511         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64; | 516         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64; | 
| 512         out[7] = fdct_round_shift(temp1); | 517         out[7] = fdct_round_shift(temp1); | 
| 513         out[15] = fdct_round_shift(temp2); | 518         out[15] = fdct_round_shift(temp2); | 
| 514       } | 519       } | 
| 515       // Do next column (which is a transposed row in second/horizontal pass) | 520       // Do next column (which is a transposed row in second/horizontal pass) | 
| 516       in++; | 521       in++; | 
|  | 522       in_pass0++; | 
| 517       out += 16; | 523       out += 16; | 
| 518     } | 524     } | 
| 519     // Setup in/out for next pass. | 525     // Setup in/out for next pass. | 
| 520     in = intermediate; | 526     in = intermediate; | 
| 521     out = output; | 527     out = output; | 
| 522   } | 528   } | 
| 523 } | 529 } | 
| 524 | 530 | 
| 525 static void fadst8(const int16_t *input, int16_t *output) { | 531 static void fadst8(const tran_low_t *input, tran_low_t *output) { | 
| 526   int s0, s1, s2, s3, s4, s5, s6, s7; | 532   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; | 
| 527 | 533 | 
| 528   int x0 = input[7]; | 534   tran_high_t x0 = input[7]; | 
| 529   int x1 = input[0]; | 535   tran_high_t x1 = input[0]; | 
| 530   int x2 = input[5]; | 536   tran_high_t x2 = input[5]; | 
| 531   int x3 = input[2]; | 537   tran_high_t x3 = input[2]; | 
| 532   int x4 = input[3]; | 538   tran_high_t x4 = input[3]; | 
| 533   int x5 = input[4]; | 539   tran_high_t x5 = input[4]; | 
| 534   int x6 = input[1]; | 540   tran_high_t x6 = input[1]; | 
| 535   int x7 = input[6]; | 541   tran_high_t x7 = input[6]; | 
| 536 | 542 | 
| 537   // stage 1 | 543   // stage 1 | 
| 538   s0 = cospi_2_64  * x0 + cospi_30_64 * x1; | 544   s0 = cospi_2_64  * x0 + cospi_30_64 * x1; | 
| 539   s1 = cospi_30_64 * x0 - cospi_2_64  * x1; | 545   s1 = cospi_30_64 * x0 - cospi_2_64  * x1; | 
| 540   s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | 546   s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | 
| 541   s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | 547   s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | 
| 542   s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | 548   s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | 
| 543   s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | 549   s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | 
| 544   s6 = cospi_26_64 * x6 + cospi_6_64  * x7; | 550   s6 = cospi_26_64 * x6 + cospi_6_64  * x7; | 
| 545   s7 = cospi_6_64  * x6 - cospi_26_64 * x7; | 551   s7 = cospi_6_64  * x6 - cospi_26_64 * x7; | 
| (...skipping 47 matching lines...) | 
| 593   output[7] = - x1; | 599   output[7] = - x1; | 
| 594 } | 600 } | 
| 595 | 601 | 
| 596 static const transform_2d FHT_8[] = { | 602 static const transform_2d FHT_8[] = { | 
| 597   { fdct8,  fdct8  },  // DCT_DCT  = 0 | 603   { fdct8,  fdct8  },  // DCT_DCT  = 0 | 
| 598   { fadst8, fdct8  },  // ADST_DCT = 1 | 604   { fadst8, fdct8  },  // ADST_DCT = 1 | 
| 599   { fdct8,  fadst8 },  // DCT_ADST = 2 | 605   { fdct8,  fadst8 },  // DCT_ADST = 2 | 
| 600   { fadst8, fadst8 }   // ADST_ADST = 3 | 606   { fadst8, fadst8 }   // ADST_ADST = 3 | 
| 601 }; | 607 }; | 
| 602 | 608 | 
| 603 void vp9_fht8x8_c(const int16_t *input, int16_t *output, | 609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, | 
| 604                   int stride, int tx_type) { | 610                   int stride, int tx_type) { | 
| 605   if (tx_type == DCT_DCT) { | 611   if (tx_type == DCT_DCT) { | 
| 606     vp9_fdct8x8_c(input, output, stride); | 612     vp9_fdct8x8_c(input, output, stride); | 
| 607   } else { | 613   } else { | 
| 608     int16_t out[64]; | 614     tran_low_t out[64]; | 
| 609     int16_t *outptr = &out[0]; | 615     tran_low_t *outptr = &out[0]; | 
| 610     int i, j; | 616     int i, j; | 
| 611     int16_t temp_in[8], temp_out[8]; | 617     tran_low_t temp_in[8], temp_out[8]; | 
| 612     const transform_2d ht = FHT_8[tx_type]; | 618     const transform_2d ht = FHT_8[tx_type]; | 
| 613 | 619 | 
| 614     // Columns | 620     // Columns | 
| 615     for (i = 0; i < 8; ++i) { | 621     for (i = 0; i < 8; ++i) { | 
| 616       for (j = 0; j < 8; ++j) | 622       for (j = 0; j < 8; ++j) | 
| 617         temp_in[j] = input[j * stride + i] * 4; | 623         temp_in[j] = input[j * stride + i] * 4; | 
| 618       ht.cols(temp_in, temp_out); | 624       ht.cols(temp_in, temp_out); | 
| 619       for (j = 0; j < 8; ++j) | 625       for (j = 0; j < 8; ++j) | 
| 620         outptr[j * 8 + i] = temp_out[j]; | 626         outptr[j * 8 + i] = temp_out[j]; | 
| 621     } | 627     } | 
| 622 | 628 | 
| 623     // Rows | 629     // Rows | 
| 624     for (i = 0; i < 8; ++i) { | 630     for (i = 0; i < 8; ++i) { | 
| 625       for (j = 0; j < 8; ++j) | 631       for (j = 0; j < 8; ++j) | 
| 626         temp_in[j] = out[j + i * 8]; | 632         temp_in[j] = out[j + i * 8]; | 
| 627       ht.rows(temp_in, temp_out); | 633       ht.rows(temp_in, temp_out); | 
| 628       for (j = 0; j < 8; ++j) | 634       for (j = 0; j < 8; ++j) | 
| 629         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 635         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 
| 630     } | 636     } | 
| 631   } | 637   } | 
| 632 } | 638 } | 
| 633 | 639 | 
| 634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per | 640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per | 
| 635    pixel. */ | 641    pixel. */ | 
| 636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { | 642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 637   int i; | 643   int i; | 
| 638   int a1, b1, c1, d1, e1; | 644   tran_high_t a1, b1, c1, d1, e1; | 
| 639   const int16_t *ip = input; | 645   const int16_t *ip_pass0 = input; | 
| 640   int16_t *op = output; | 646   const tran_low_t *ip = NULL; | 
|  | 647   tran_low_t *op = output; | 
| 641 | 648 | 
| 642   for (i = 0; i < 4; i++) { | 649   for (i = 0; i < 4; i++) { | 
| 643     a1 = ip[0 * stride]; | 650     a1 = ip_pass0[0 * stride]; | 
| 644     b1 = ip[1 * stride]; | 651     b1 = ip_pass0[1 * stride]; | 
| 645     c1 = ip[2 * stride]; | 652     c1 = ip_pass0[2 * stride]; | 
| 646     d1 = ip[3 * stride]; | 653     d1 = ip_pass0[3 * stride]; | 
| 647 | 654 | 
| 648     a1 += b1; | 655     a1 += b1; | 
| 649     d1 = d1 - c1; | 656     d1 = d1 - c1; | 
| 650     e1 = (a1 - d1) >> 1; | 657     e1 = (a1 - d1) >> 1; | 
| 651     b1 = e1 - b1; | 658     b1 = e1 - b1; | 
| 652     c1 = e1 - c1; | 659     c1 = e1 - c1; | 
| 653     a1 -= c1; | 660     a1 -= c1; | 
| 654     d1 += b1; | 661     d1 += b1; | 
| 655     op[0] = a1; | 662     op[0] = a1; | 
| 656     op[4] = c1; | 663     op[4] = c1; | 
| 657     op[8] = d1; | 664     op[8] = d1; | 
| 658     op[12] = b1; | 665     op[12] = b1; | 
| 659 | 666 | 
| 660     ip++; | 667     ip_pass0++; | 
| 661     op++; | 668     op++; | 
| 662   } | 669   } | 
| 663   ip = output; | 670   ip = output; | 
| 664   op = output; | 671   op = output; | 
| 665 | 672 | 
| 666   for (i = 0; i < 4; i++) { | 673   for (i = 0; i < 4; i++) { | 
| 667     a1 = ip[0]; | 674     a1 = ip[0]; | 
| 668     b1 = ip[1]; | 675     b1 = ip[1]; | 
| 669     c1 = ip[2]; | 676     c1 = ip[2]; | 
| 670     d1 = ip[3]; | 677     d1 = ip[3]; | 
| 671 | 678 | 
| 672     a1 += b1; | 679     a1 += b1; | 
| 673     d1 -= c1; | 680     d1 -= c1; | 
| 674     e1 = (a1 - d1) >> 1; | 681     e1 = (a1 - d1) >> 1; | 
| 675     b1 = e1 - b1; | 682     b1 = e1 - b1; | 
| 676     c1 = e1 - c1; | 683     c1 = e1 - c1; | 
| 677     a1 -= c1; | 684     a1 -= c1; | 
| 678     d1 += b1; | 685     d1 += b1; | 
| 679     op[0] = a1 * UNIT_QUANT_FACTOR; | 686     op[0] = a1 * UNIT_QUANT_FACTOR; | 
| 680     op[1] = c1 * UNIT_QUANT_FACTOR; | 687     op[1] = c1 * UNIT_QUANT_FACTOR; | 
| 681     op[2] = d1 * UNIT_QUANT_FACTOR; | 688     op[2] = d1 * UNIT_QUANT_FACTOR; | 
| 682     op[3] = b1 * UNIT_QUANT_FACTOR; | 689     op[3] = b1 * UNIT_QUANT_FACTOR; | 
| 683 | 690 | 
| 684     ip += 4; | 691     ip += 4; | 
| 685     op += 4; | 692     op += 4; | 
| 686   } | 693   } | 
| 687 } | 694 } | 
| 688 | 695 | 
| 689 // Rewrote to use same algorithm as others. | 696 // Rewrote to use same algorithm as others. | 
| 690 static void fdct16(const int16_t in[16], int16_t out[16]) { | 697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { | 
| 691   /*canbe16*/ int step1[8]; | 698   tran_high_t step1[8];      // canbe16 | 
| 692   /*canbe16*/ int step2[8]; | 699   tran_high_t step2[8];      // canbe16 | 
| 693   /*canbe16*/ int step3[8]; | 700   tran_high_t step3[8];      // canbe16 | 
| 694   /*canbe16*/ int input[8]; | 701   tran_high_t input[8];      // canbe16 | 
| 695   /*needs32*/ int temp1, temp2; | 702   tran_high_t temp1, temp2;  // needs32 | 
| 696 | 703 | 
| 697   // step 1 | 704   // step 1 | 
| 698   input[0] = in[0] + in[15]; | 705   input[0] = in[0] + in[15]; | 
| 699   input[1] = in[1] + in[14]; | 706   input[1] = in[1] + in[14]; | 
| 700   input[2] = in[2] + in[13]; | 707   input[2] = in[2] + in[13]; | 
| 701   input[3] = in[3] + in[12]; | 708   input[3] = in[3] + in[12]; | 
| 702   input[4] = in[4] + in[11]; | 709   input[4] = in[4] + in[11]; | 
| 703   input[5] = in[5] + in[10]; | 710   input[5] = in[5] + in[10]; | 
| 704   input[6] = in[6] + in[ 9]; | 711   input[6] = in[6] + in[ 9]; | 
| 705   input[7] = in[7] + in[ 8]; | 712   input[7] = in[7] + in[ 8]; | 
| 706 | 713 | 
| 707   step1[0] = in[7] - in[ 8]; | 714   step1[0] = in[7] - in[ 8]; | 
| 708   step1[1] = in[6] - in[ 9]; | 715   step1[1] = in[6] - in[ 9]; | 
| 709   step1[2] = in[5] - in[10]; | 716   step1[2] = in[5] - in[10]; | 
| 710   step1[3] = in[4] - in[11]; | 717   step1[3] = in[4] - in[11]; | 
| 711   step1[4] = in[3] - in[12]; | 718   step1[4] = in[3] - in[12]; | 
| 712   step1[5] = in[2] - in[13]; | 719   step1[5] = in[2] - in[13]; | 
| 713   step1[6] = in[1] - in[14]; | 720   step1[6] = in[1] - in[14]; | 
| 714   step1[7] = in[0] - in[15]; | 721   step1[7] = in[0] - in[15]; | 
| 715 | 722 | 
| 716   // fdct8(step, step); | 723   // fdct8(step, step); | 
| 717   { | 724   { | 
| 718     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 725     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16 | 
| 719     /*needs32*/ int t0, t1, t2, t3; | 726     tran_high_t t0, t1, t2, t3;                  // needs32 | 
| 720     /*canbe16*/ int x0, x1, x2, x3; | 727     tran_high_t x0, x1, x2, x3;                  // canbe16 | 
| 721 | 728 | 
| 722     // stage 1 | 729     // stage 1 | 
| 723     s0 = input[0] + input[7]; | 730     s0 = input[0] + input[7]; | 
| 724     s1 = input[1] + input[6]; | 731     s1 = input[1] + input[6]; | 
| 725     s2 = input[2] + input[5]; | 732     s2 = input[2] + input[5]; | 
| 726     s3 = input[3] + input[4]; | 733     s3 = input[3] + input[4]; | 
| 727     s4 = input[3] - input[4]; | 734     s4 = input[3] - input[4]; | 
| 728     s5 = input[2] - input[5]; | 735     s5 = input[2] - input[5]; | 
| 729     s6 = input[1] - input[6]; | 736     s6 = input[1] - input[6]; | 
| 730     s7 = input[0] - input[7]; | 737     s7 = input[0] - input[7]; | 
| (...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 821   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 828   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 
| 822   out[3] = fdct_round_shift(temp1); | 829   out[3] = fdct_round_shift(temp1); | 
| 823   out[11] = fdct_round_shift(temp2); | 830   out[11] = fdct_round_shift(temp2); | 
| 824 | 831 | 
| 825   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 832   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 
| 826   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64; | 833   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64; | 
| 827   out[7] = fdct_round_shift(temp1); | 834   out[7] = fdct_round_shift(temp1); | 
| 828   out[15] = fdct_round_shift(temp2); | 835   out[15] = fdct_round_shift(temp2); | 
| 829 } | 836 } | 
| 830 | 837 | 
| 831 static void fadst16(const int16_t *input, int16_t *output) { | 838 static void fadst16(const tran_low_t *input, tran_low_t *output) { | 
| 832   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 839   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; | 
|  | 840   tran_high_t s9, s10, s11, s12, s13, s14, s15; | 
| 833 | 841 | 
| 834   int x0 = input[15]; | 842   tran_high_t x0 = input[15]; | 
| 835   int x1 = input[0]; | 843   tran_high_t x1 = input[0]; | 
| 836   int x2 = input[13]; | 844   tran_high_t x2 = input[13]; | 
| 837   int x3 = input[2]; | 845   tran_high_t x3 = input[2]; | 
| 838   int x4 = input[11]; | 846   tran_high_t x4 = input[11]; | 
| 839   int x5 = input[4]; | 847   tran_high_t x5 = input[4]; | 
| 840   int x6 = input[9]; | 848   tran_high_t x6 = input[9]; | 
| 841   int x7 = input[6]; | 849   tran_high_t x7 = input[6]; | 
| 842   int x8 = input[7]; | 850   tran_high_t x8 = input[7]; | 
| 843   int x9 = input[8]; | 851   tran_high_t x9 = input[8]; | 
| 844   int x10 = input[5]; | 852   tran_high_t x10 = input[5]; | 
| 845   int x11 = input[10]; | 853   tran_high_t x11 = input[10]; | 
| 846   int x12 = input[3]; | 854   tran_high_t x12 = input[3]; | 
| 847   int x13 = input[12]; | 855   tran_high_t x13 = input[12]; | 
| 848   int x14 = input[1]; | 856   tran_high_t x14 = input[1]; | 
| 849   int x15 = input[14]; | 857   tran_high_t x15 = input[14]; | 
| 850 | 858 | 
| 851   // stage 1 | 859   // stage 1 | 
| 852   s0 = x0 * cospi_1_64  + x1 * cospi_31_64; | 860   s0 = x0 * cospi_1_64  + x1 * cospi_31_64; | 
| 853   s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | 861   s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | 
| 854   s2 = x2 * cospi_5_64  + x3 * cospi_27_64; | 862   s2 = x2 * cospi_5_64  + x3 * cospi_27_64; | 
| 855   s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | 863   s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | 
| 856   s4 = x4 * cospi_9_64  + x5 * cospi_23_64; | 864   s4 = x4 * cospi_9_64  + x5 * cospi_23_64; | 
| 857   s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | 865   s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | 
| 858   s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | 866   s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | 
| 859   s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | 867   s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | 
| (...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 990   output[15] = - x1; | 998   output[15] = - x1; | 
| 991 } | 999 } | 
| 992 | 1000 | 
| 993 static const transform_2d FHT_16[] = { | 1001 static const transform_2d FHT_16[] = { | 
| 994   { fdct16,  fdct16  },  // DCT_DCT  = 0 | 1002   { fdct16,  fdct16  },  // DCT_DCT  = 0 | 
| 995   { fadst16, fdct16  },  // ADST_DCT = 1 | 1003   { fadst16, fdct16  },  // ADST_DCT = 1 | 
| 996   { fdct16,  fadst16 },  // DCT_ADST = 2 | 1004   { fdct16,  fadst16 },  // DCT_ADST = 2 | 
| 997   { fadst16, fadst16 }   // ADST_ADST = 3 | 1005   { fadst16, fadst16 }   // ADST_ADST = 3 | 
| 998 }; | 1006 }; | 
| 999 | 1007 | 
| 1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output, | 1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, | 
| 1001                     int stride, int tx_type) { | 1009                     int stride, int tx_type) { | 
| 1002   if (tx_type == DCT_DCT) { | 1010   if (tx_type == DCT_DCT) { | 
| 1003     vp9_fdct16x16_c(input, output, stride); | 1011     vp9_fdct16x16_c(input, output, stride); | 
| 1004   } else { | 1012   } else { | 
| 1005     int16_t out[256]; | 1013     tran_low_t out[256]; | 
| 1006     int16_t *outptr = &out[0]; | 1014     tran_low_t *outptr = &out[0]; | 
| 1007     int i, j; | 1015     int i, j; | 
| 1008     int16_t temp_in[16], temp_out[16]; | 1016     tran_low_t temp_in[16], temp_out[16]; | 
| 1009     const transform_2d ht = FHT_16[tx_type]; | 1017     const transform_2d ht = FHT_16[tx_type]; | 
| 1010 | 1018 | 
| 1011     // Columns | 1019     // Columns | 
| 1012     for (i = 0; i < 16; ++i) { | 1020     for (i = 0; i < 16; ++i) { | 
| 1013       for (j = 0; j < 16; ++j) | 1021       for (j = 0; j < 16; ++j) | 
| 1014         temp_in[j] = input[j * stride + i] * 4; | 1022         temp_in[j] = input[j * stride + i] * 4; | 
| 1015       ht.cols(temp_in, temp_out); | 1023       ht.cols(temp_in, temp_out); | 
| 1016       for (j = 0; j < 16; ++j) | 1024       for (j = 0; j < 16; ++j) | 
| 1017         outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1025         outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 
| 1018     } | 1026     } | 
| 1019 | 1027 | 
| 1020     // Rows | 1028     // Rows | 
| 1021     for (i = 0; i < 16; ++i) { | 1029     for (i = 0; i < 16; ++i) { | 
| 1022       for (j = 0; j < 16; ++j) | 1030       for (j = 0; j < 16; ++j) | 
| 1023         temp_in[j] = out[j + i * 16]; | 1031         temp_in[j] = out[j + i * 16]; | 
| 1024       ht.rows(temp_in, temp_out); | 1032       ht.rows(temp_in, temp_out); | 
| 1025       for (j = 0; j < 16; ++j) | 1033       for (j = 0; j < 16; ++j) | 
| 1026         output[j + i * 16] = temp_out[j]; | 1034         output[j + i * 16] = temp_out[j]; | 
| 1027     } | 1035     } | 
| 1028   } | 1036   } | 
| 1029 } | 1037 } | 
| 1030 | 1038 | 
| 1031 static INLINE int dct_32_round(int input) { | 1039 static INLINE tran_high_t dct_32_round(tran_high_t input) { | 
| 1032   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 1040   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 
| 1033   assert(-131072 <= rv && rv <= 131071); | 1041   // TODO(debargha, peter.derivaz): Find new bounds for this assert, | 
|  | 1042   // and make the bounds consts. | 
|  | 1043   // assert(-131072 <= rv && rv <= 131071); | 
| 1034   return rv; | 1044   return rv; | 
| 1035 } | 1045 } | 
| 1036 | 1046 | 
| 1037 static INLINE int half_round_shift(int input) { | 1047 static INLINE tran_high_t half_round_shift(tran_high_t input) { | 
| 1038   int rv = (input + 1 + (input < 0)) >> 2; | 1048   tran_high_t rv = (input + 1 + (input < 0)) >> 2; | 
| 1039   return rv; | 1049   return rv; | 
| 1040 } | 1050 } | 
| 1041 | 1051 | 
| 1042 static void fdct32(const int *input, int *output, int round) { | 1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) { | 
| 1043   int step[32]; | 1053   tran_high_t step[32]; | 
| 1044   // Stage 1 | 1054   // Stage 1 | 
| 1045   step[0] = input[0] + input[(32 - 1)]; | 1055   step[0] = input[0] + input[(32 - 1)]; | 
| 1046   step[1] = input[1] + input[(32 - 2)]; | 1056   step[1] = input[1] + input[(32 - 2)]; | 
| 1047   step[2] = input[2] + input[(32 - 3)]; | 1057   step[2] = input[2] + input[(32 - 3)]; | 
| 1048   step[3] = input[3] + input[(32 - 4)]; | 1058   step[3] = input[3] + input[(32 - 4)]; | 
| 1049   step[4] = input[4] + input[(32 - 5)]; | 1059   step[4] = input[4] + input[(32 - 5)]; | 
| 1050   step[5] = input[5] + input[(32 - 6)]; | 1060   step[5] = input[5] + input[(32 - 6)]; | 
| 1051   step[6] = input[6] + input[(32 - 7)]; | 1061   step[6] = input[6] + input[(32 - 7)]; | 
| 1052   step[7] = input[7] + input[(32 - 8)]; | 1062   step[7] = input[7] + input[(32 - 8)]; | 
| 1053   step[8] = input[8] + input[(32 - 9)]; | 1063   step[8] = input[8] + input[(32 - 9)]; | 
| (...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1355   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); | 1365   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); | 
| 1356   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); | 1366   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); | 
| 1357   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); | 1367   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); | 
| 1358   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); | 1368   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); | 
| 1359   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); | 1369   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); | 
| 1360   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); | 1370   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); | 
| 1361   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); | 1371   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); | 
| 1362   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); | 1372   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); | 
| 1363 } | 1373 } | 
| 1364 | 1374 | 
| 1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { | 1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { | 
| 1366   int r, c; | 1376   int r, c; | 
| 1367   int16_t sum = 0; | 1377   tran_low_t sum = 0; | 
| 1368   for (r = 0; r < 32; ++r) | 1378   for (r = 0; r < 32; ++r) | 
| 1369     for (c = 0; c < 32; ++c) | 1379     for (c = 0; c < 32; ++c) | 
| 1370       sum += input[r * stride + c]; | 1380       sum += input[r * stride + c]; | 
| 1371 | 1381 | 
| 1372   output[0] = sum >> 3; | 1382   output[0] = sum >> 3; | 
| 1373   output[1] = 0; | 1383   output[1] = 0; | 
| 1374 } | 1384 } | 
| 1375 | 1385 | 
| 1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { | 1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { | 
| 1377   int i, j; | 1387   int i, j; | 
| 1378   int output[32 * 32]; | 1388   tran_high_t output[32 * 32]; | 
| 1379 | 1389 | 
| 1380   // Columns | 1390   // Columns | 
| 1381   for (i = 0; i < 32; ++i) { | 1391   for (i = 0; i < 32; ++i) { | 
| 1382     int temp_in[32], temp_out[32]; | 1392     tran_high_t temp_in[32], temp_out[32]; | 
| 1383     for (j = 0; j < 32; ++j) | 1393     for (j = 0; j < 32; ++j) | 
| 1384       temp_in[j] = input[j * stride + i] * 4; | 1394       temp_in[j] = input[j * stride + i] * 4; | 
| 1385     fdct32(temp_in, temp_out, 0); | 1395     fdct32(temp_in, temp_out, 0); | 
| 1386     for (j = 0; j < 32; ++j) | 1396     for (j = 0; j < 32; ++j) | 
| 1387       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1397       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 
| 1388   } | 1398   } | 
| 1389 | 1399 | 
| 1390   // Rows | 1400   // Rows | 
| 1391   for (i = 0; i < 32; ++i) { | 1401   for (i = 0; i < 32; ++i) { | 
| 1392     int temp_in[32], temp_out[32]; | 1402     tran_high_t temp_in[32], temp_out[32]; | 
| 1393     for (j = 0; j < 32; ++j) | 1403     for (j = 0; j < 32; ++j) | 
| 1394       temp_in[j] = output[j + i * 32]; | 1404       temp_in[j] = output[j + i * 32]; | 
| 1395     fdct32(temp_in, temp_out, 0); | 1405     fdct32(temp_in, temp_out, 0); | 
| 1396     for (j = 0; j < 32; ++j) | 1406     for (j = 0; j < 32; ++j) | 
| 1397       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1407       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 
| 1398   } | 1408   } | 
| 1399 } | 1409 } | 
| 1400 | 1410 | 
| 1401 // Note that although we use dct_32_round in dct32 computation flow, | 1411 // Note that although we use dct_32_round in dct32 computation flow, | 
| 1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating | 1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating | 
| 1403 // within 16 bits precision. | 1413 // within 16 bits precision. | 
| 1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { | 1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { | 
| 1405   int i, j; | 1415   int i, j; | 
| 1406   int output[32 * 32]; | 1416   tran_high_t output[32 * 32]; | 
| 1407 | 1417 | 
| 1408   // Columns | 1418   // Columns | 
| 1409   for (i = 0; i < 32; ++i) { | 1419   for (i = 0; i < 32; ++i) { | 
| 1410     int temp_in[32], temp_out[32]; | 1420     tran_high_t temp_in[32], temp_out[32]; | 
| 1411     for (j = 0; j < 32; ++j) | 1421     for (j = 0; j < 32; ++j) | 
| 1412       temp_in[j] = input[j * stride + i] * 4; | 1422       temp_in[j] = input[j * stride + i] * 4; | 
| 1413     fdct32(temp_in, temp_out, 0); | 1423     fdct32(temp_in, temp_out, 0); | 
| 1414     for (j = 0; j < 32; ++j) | 1424     for (j = 0; j < 32; ++j) | 
| 1415       // TODO(cd): see quality impact of only doing | 1425       // TODO(cd): see quality impact of only doing | 
| 1416       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2; | 1426       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2; | 
| 1417       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c | 1427       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c | 
| 1418       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1428       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 
| 1419   } | 1429   } | 
| 1420 | 1430 | 
| 1421   // Rows | 1431   // Rows | 
| 1422   for (i = 0; i < 32; ++i) { | 1432   for (i = 0; i < 32; ++i) { | 
| 1423     int temp_in[32], temp_out[32]; | 1433     tran_high_t temp_in[32], temp_out[32]; | 
| 1424     for (j = 0; j < 32; ++j) | 1434     for (j = 0; j < 32; ++j) | 
| 1425       temp_in[j] = output[j + i * 32]; | 1435       temp_in[j] = output[j + i * 32]; | 
| 1426     fdct32(temp_in, temp_out, 1); | 1436     fdct32(temp_in, temp_out, 1); | 
| 1427     for (j = 0; j < 32; ++j) | 1437     for (j = 0; j < 32; ++j) | 
| 1428       out[j + i * 32] = temp_out[j]; | 1438       out[j + i * 32] = temp_out[j]; | 
| 1429   } | 1439   } | 
| 1430 } | 1440 } | 
|  | 1441 | 
|  | 1442 #if CONFIG_VP9_HIGHBITDEPTH | 
|  | 1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { | 
|  | 1444   vp9_fdct4x4_c(input, output, stride); | 
|  | 1445 } | 
|  | 1446 | 
|  | 1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output, | 
|  | 1448                        int stride, int tx_type) { | 
|  | 1449   vp9_fht4x4_c(input, output, stride, tx_type); | 
|  | 1450 } | 
|  | 1451 | 
|  | 1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, | 
|  | 1453                           int stride) { | 
|  | 1454   vp9_fdct8x8_1_c(input, final_output, stride); | 
|  | 1455 } | 
|  | 1456 | 
|  | 1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output, | 
|  | 1458                         int stride) { | 
|  | 1459   vp9_fdct8x8_c(input, final_output, stride); | 
|  | 1460 } | 
|  | 1461 | 
|  | 1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output, | 
|  | 1463                             int stride) { | 
|  | 1464   vp9_fdct16x16_1_c(input, output, stride); | 
|  | 1465 } | 
|  | 1466 | 
|  | 1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output, | 
|  | 1468                           int stride) { | 
|  | 1469   vp9_fdct16x16_c(input, output, stride); | 
|  | 1470 } | 
|  | 1471 | 
|  | 1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output, | 
|  | 1473                   int stride, int tx_type) { | 
|  | 1474   vp9_fht8x8_c(input, output, stride, tx_type); | 
|  | 1475 } | 
|  | 1476 | 
|  | 1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { | 
|  | 1478   vp9_fwht4x4_c(input, output, stride); | 
|  | 1479 } | 
|  | 1480 | 
|  | 1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output, | 
|  | 1482                     int stride, int tx_type) { | 
|  | 1483   vp9_fht16x16_c(input, output, stride, tx_type); | 
|  | 1484 } | 
|  | 1485 | 
|  | 1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) { | 
|  | 1487   vp9_fdct32x32_1_c(input, out, stride); | 
|  | 1488 } | 
|  | 1489 | 
|  | 1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { | 
|  | 1491   vp9_fdct32x32_c(input, out, stride); | 
|  | 1492 } | 
|  | 1493 | 
|  | 1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, | 
|  | 1495                              int stride) { | 
|  | 1496   vp9_fdct32x32_rd_c(input, out, stride); | 
|  | 1497 } | 
|  | 1498 #endif  // CONFIG_VP9_HIGHBITDEPTH | 
| OLD | NEW | 
|---|