| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | 11 #include <assert.h> |
| 12 #include <math.h> | 12 #include <math.h> |
| 13 | 13 |
| 14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
| 16 | 16 |
| 17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
| 18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
| 19 #include "vp9/common/vp9_systemdependent.h" | 19 #include "vp9/common/vp9_systemdependent.h" |
| 20 | 20 |
| 21 static INLINE int fdct_round_shift(int input) { | 21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) { |
| 22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 22 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
| 23 assert(INT16_MIN <= rv && rv <= INT16_MAX); | 23 // TODO(debargha, peter.derivaz): Find new bounds for this assert |
| 24 // and make the bounds consts. |
| 25 // assert(INT16_MIN <= rv && rv <= INT16_MAX); |
| 24 return rv; | 26 return rv; |
| 25 } | 27 } |
| 26 | 28 |
| 27 static void fdct4(const int16_t *input, int16_t *output) { | 29 static void fdct4(const tran_low_t *input, tran_low_t *output) { |
| 28 int16_t step[4]; | 30 tran_high_t step[4]; |
| 29 int temp1, temp2; | 31 tran_high_t temp1, temp2; |
| 30 | 32 |
| 31 step[0] = input[0] + input[3]; | 33 step[0] = input[0] + input[3]; |
| 32 step[1] = input[1] + input[2]; | 34 step[1] = input[1] + input[2]; |
| 33 step[2] = input[1] - input[2]; | 35 step[2] = input[1] - input[2]; |
| 34 step[3] = input[0] - input[3]; | 36 step[3] = input[0] - input[3]; |
| 35 | 37 |
| 36 temp1 = (step[0] + step[1]) * cospi_16_64; | 38 temp1 = (step[0] + step[1]) * cospi_16_64; |
| 37 temp2 = (step[0] - step[1]) * cospi_16_64; | 39 temp2 = (step[0] - step[1]) * cospi_16_64; |
| 38 output[0] = fdct_round_shift(temp1); | 40 output[0] = fdct_round_shift(temp1); |
| 39 output[2] = fdct_round_shift(temp2); | 41 output[2] = fdct_round_shift(temp2); |
| 40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 42 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
| 41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 43 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
| 42 output[1] = fdct_round_shift(temp1); | 44 output[1] = fdct_round_shift(temp1); |
| 43 output[3] = fdct_round_shift(temp2); | 45 output[3] = fdct_round_shift(temp2); |
| 44 } | 46 } |
| 45 | 47 |
| 46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { | 48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { |
| 47 int r, c; | 49 int r, c; |
| 48 int16_t sum = 0; | 50 tran_low_t sum = 0; |
| 49 for (r = 0; r < 4; ++r) | 51 for (r = 0; r < 4; ++r) |
| 50 for (c = 0; c < 4; ++c) | 52 for (c = 0; c < 4; ++c) |
| 51 sum += input[r * stride + c]; | 53 sum += input[r * stride + c]; |
| 52 | 54 |
| 53 output[0] = sum << 1; | 55 output[0] = sum << 1; |
| 54 output[1] = 0; | 56 output[1] = 0; |
| 55 } | 57 } |
| 56 | 58 |
| 57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { | 59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
| 58 // The 2D transform is done with two passes which are actually pretty | 60 // The 2D transform is done with two passes which are actually pretty |
| 59 // similar. In the first one, we transform the columns and transpose | 61 // similar. In the first one, we transform the columns and transpose |
| 60 // the results. In the second one, we transform the rows. To achieve that, | 62 // the results. In the second one, we transform the rows. To achieve that, |
| 61 // as the first pass results are transposed, we transpose the columns (that | 63 // as the first pass results are transposed, we transpose the columns (that |
| 62 // is the transposed rows) and transpose the results (so that it goes back | 64 // is the transposed rows) and transpose the results (so that it goes back |
| 63 // in normal/row positions). | 65 // in normal/row positions). |
| 64 int pass; | 66 int pass; |
| 65 // We need an intermediate buffer between passes. | 67 // We need an intermediate buffer between passes. |
| 66 int16_t intermediate[4 * 4]; | 68 tran_low_t intermediate[4 * 4]; |
| 67 const int16_t *in = input; | 69 const int16_t *in_pass0 = input; |
| 68 int16_t *out = intermediate; | 70 const tran_low_t *in = NULL; |
| 71 tran_low_t *out = intermediate; |
| 69 // Do the two transform/transpose passes | 72 // Do the two transform/transpose passes |
| 70 for (pass = 0; pass < 2; ++pass) { | 73 for (pass = 0; pass < 2; ++pass) { |
| 71 /*canbe16*/ int input[4]; | 74 tran_high_t input[4]; // canbe16 |
| 72 /*canbe16*/ int step[4]; | 75 tran_high_t step[4]; // canbe16 |
| 73 /*needs32*/ int temp1, temp2; | 76 tran_high_t temp1, temp2; // needs32 |
| 74 int i; | 77 int i; |
| 75 for (i = 0; i < 4; ++i) { | 78 for (i = 0; i < 4; ++i) { |
| 76 // Load inputs. | 79 // Load inputs. |
| 77 if (0 == pass) { | 80 if (0 == pass) { |
| 78 input[0] = in[0 * stride] * 16; | 81 input[0] = in_pass0[0 * stride] * 16; |
| 79 input[1] = in[1 * stride] * 16; | 82 input[1] = in_pass0[1 * stride] * 16; |
| 80 input[2] = in[2 * stride] * 16; | 83 input[2] = in_pass0[2 * stride] * 16; |
| 81 input[3] = in[3 * stride] * 16; | 84 input[3] = in_pass0[3 * stride] * 16; |
| 82 if (i == 0 && input[0]) { | 85 if (i == 0 && input[0]) { |
| 83 input[0] += 1; | 86 input[0] += 1; |
| 84 } | 87 } |
| 85 } else { | 88 } else { |
| 86 input[0] = in[0 * 4]; | 89 input[0] = in[0 * 4]; |
| 87 input[1] = in[1 * 4]; | 90 input[1] = in[1 * 4]; |
| 88 input[2] = in[2 * 4]; | 91 input[2] = in[2 * 4]; |
| 89 input[3] = in[3 * 4]; | 92 input[3] = in[3 * 4]; |
| 90 } | 93 } |
| 91 // Transform. | 94 // Transform. |
| 92 step[0] = input[0] + input[3]; | 95 step[0] = input[0] + input[3]; |
| 93 step[1] = input[1] + input[2]; | 96 step[1] = input[1] + input[2]; |
| 94 step[2] = input[1] - input[2]; | 97 step[2] = input[1] - input[2]; |
| 95 step[3] = input[0] - input[3]; | 98 step[3] = input[0] - input[3]; |
| 96 temp1 = (step[0] + step[1]) * cospi_16_64; | 99 temp1 = (step[0] + step[1]) * cospi_16_64; |
| 97 temp2 = (step[0] - step[1]) * cospi_16_64; | 100 temp2 = (step[0] - step[1]) * cospi_16_64; |
| 98 out[0] = fdct_round_shift(temp1); | 101 out[0] = fdct_round_shift(temp1); |
| 99 out[2] = fdct_round_shift(temp2); | 102 out[2] = fdct_round_shift(temp2); |
| 100 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 103 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
| 101 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 104 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
| 102 out[1] = fdct_round_shift(temp1); | 105 out[1] = fdct_round_shift(temp1); |
| 103 out[3] = fdct_round_shift(temp2); | 106 out[3] = fdct_round_shift(temp2); |
| 104 // Do next column (which is a transposed row in second/horizontal pass) | 107 // Do next column (which is a transposed row in second/horizontal pass) |
| 108 in_pass0++; |
| 105 in++; | 109 in++; |
| 106 out += 4; | 110 out += 4; |
| 107 } | 111 } |
| 108 // Setup in/out for next pass. | 112 // Setup in/out for next pass. |
| 109 in = intermediate; | 113 in = intermediate; |
| 110 out = output; | 114 out = output; |
| 111 } | 115 } |
| 112 | 116 |
| 113 { | 117 { |
| 114 int i, j; | 118 int i, j; |
| 115 for (i = 0; i < 4; ++i) { | 119 for (i = 0; i < 4; ++i) { |
| 116 for (j = 0; j < 4; ++j) | 120 for (j = 0; j < 4; ++j) |
| 117 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; | 121 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; |
| 118 } | 122 } |
| 119 } | 123 } |
| 120 } | 124 } |
| 121 | 125 |
| 122 static void fadst4(const int16_t *input, int16_t *output) { | 126 static void fadst4(const tran_low_t *input, tran_low_t *output) { |
| 123 int x0, x1, x2, x3; | 127 tran_high_t x0, x1, x2, x3; |
| 124 int s0, s1, s2, s3, s4, s5, s6, s7; | 128 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
| 125 | 129 |
| 126 x0 = input[0]; | 130 x0 = input[0]; |
| 127 x1 = input[1]; | 131 x1 = input[1]; |
| 128 x2 = input[2]; | 132 x2 = input[2]; |
| 129 x3 = input[3]; | 133 x3 = input[3]; |
| 130 | 134 |
| 131 if (!(x0 | x1 | x2 | x3)) { | 135 if (!(x0 | x1 | x2 | x3)) { |
| 132 output[0] = output[1] = output[2] = output[3] = 0; | 136 output[0] = output[1] = output[2] = output[3] = 0; |
| 133 return; | 137 return; |
| 134 } | 138 } |
| (...skipping 24 matching lines...) Expand all Loading... |
| 159 output[3] = fdct_round_shift(s3); | 163 output[3] = fdct_round_shift(s3); |
| 160 } | 164 } |
| 161 | 165 |
| 162 static const transform_2d FHT_4[] = { | 166 static const transform_2d FHT_4[] = { |
| 163 { fdct4, fdct4 }, // DCT_DCT = 0 | 167 { fdct4, fdct4 }, // DCT_DCT = 0 |
| 164 { fadst4, fdct4 }, // ADST_DCT = 1 | 168 { fadst4, fdct4 }, // ADST_DCT = 1 |
| 165 { fdct4, fadst4 }, // DCT_ADST = 2 | 169 { fdct4, fadst4 }, // DCT_ADST = 2 |
| 166 { fadst4, fadst4 } // ADST_ADST = 3 | 170 { fadst4, fadst4 } // ADST_ADST = 3 |
| 167 }; | 171 }; |
| 168 | 172 |
| 169 void vp9_fht4x4_c(const int16_t *input, int16_t *output, | 173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, |
| 170 int stride, int tx_type) { | 174 int stride, int tx_type) { |
| 171 if (tx_type == DCT_DCT) { | 175 if (tx_type == DCT_DCT) { |
| 172 vp9_fdct4x4_c(input, output, stride); | 176 vp9_fdct4x4_c(input, output, stride); |
| 173 } else { | 177 } else { |
| 174 int16_t out[4 * 4]; | 178 tran_low_t out[4 * 4]; |
| 175 int16_t *outptr = &out[0]; | 179 tran_low_t *outptr = &out[0]; |
| 176 int i, j; | 180 int i, j; |
| 177 int16_t temp_in[4], temp_out[4]; | 181 tran_low_t temp_in[4], temp_out[4]; |
| 178 const transform_2d ht = FHT_4[tx_type]; | 182 const transform_2d ht = FHT_4[tx_type]; |
| 179 | 183 |
| 180 // Columns | 184 // Columns |
| 181 for (i = 0; i < 4; ++i) { | 185 for (i = 0; i < 4; ++i) { |
| 182 for (j = 0; j < 4; ++j) | 186 for (j = 0; j < 4; ++j) |
| 183 temp_in[j] = input[j * stride + i] * 16; | 187 temp_in[j] = input[j * stride + i] * 16; |
| 184 if (i == 0 && temp_in[0]) | 188 if (i == 0 && temp_in[0]) |
| 185 temp_in[0] += 1; | 189 temp_in[0] += 1; |
| 186 ht.cols(temp_in, temp_out); | 190 ht.cols(temp_in, temp_out); |
| 187 for (j = 0; j < 4; ++j) | 191 for (j = 0; j < 4; ++j) |
| 188 outptr[j * 4 + i] = temp_out[j]; | 192 outptr[j * 4 + i] = temp_out[j]; |
| 189 } | 193 } |
| 190 | 194 |
| 191 // Rows | 195 // Rows |
| 192 for (i = 0; i < 4; ++i) { | 196 for (i = 0; i < 4; ++i) { |
| 193 for (j = 0; j < 4; ++j) | 197 for (j = 0; j < 4; ++j) |
| 194 temp_in[j] = out[j + i * 4]; | 198 temp_in[j] = out[j + i * 4]; |
| 195 ht.rows(temp_in, temp_out); | 199 ht.rows(temp_in, temp_out); |
| 196 for (j = 0; j < 4; ++j) | 200 for (j = 0; j < 4; ++j) |
| 197 output[j + i * 4] = (temp_out[j] + 1) >> 2; | 201 output[j + i * 4] = (temp_out[j] + 1) >> 2; |
| 198 } | 202 } |
| 199 } | 203 } |
| 200 } | 204 } |
| 201 | 205 |
| 202 static void fdct8(const int16_t *input, int16_t *output) { | 206 static void fdct8(const tran_low_t *input, tran_low_t *output) { |
| 203 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 207 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
| 204 /*needs32*/ int t0, t1, t2, t3; | 208 tran_high_t t0, t1, t2, t3; // needs32 |
| 205 /*canbe16*/ int x0, x1, x2, x3; | 209 tran_high_t x0, x1, x2, x3; // canbe16 |
| 206 | 210 |
| 207 // stage 1 | 211 // stage 1 |
| 208 s0 = input[0] + input[7]; | 212 s0 = input[0] + input[7]; |
| 209 s1 = input[1] + input[6]; | 213 s1 = input[1] + input[6]; |
| 210 s2 = input[2] + input[5]; | 214 s2 = input[2] + input[5]; |
| 211 s3 = input[3] + input[4]; | 215 s3 = input[3] + input[4]; |
| 212 s4 = input[3] - input[4]; | 216 s4 = input[3] - input[4]; |
| 213 s5 = input[2] - input[5]; | 217 s5 = input[2] - input[5]; |
| 214 s6 = input[1] - input[6]; | 218 s6 = input[1] - input[6]; |
| 215 s7 = input[0] - input[7]; | 219 s7 = input[0] - input[7]; |
| (...skipping 28 matching lines...) Expand all Loading... |
| 244 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; | 248 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
| 245 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; | 249 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
| 246 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; | 250 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
| 247 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; | 251 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
| 248 output[1] = fdct_round_shift(t0); | 252 output[1] = fdct_round_shift(t0); |
| 249 output[3] = fdct_round_shift(t2); | 253 output[3] = fdct_round_shift(t2); |
| 250 output[5] = fdct_round_shift(t1); | 254 output[5] = fdct_round_shift(t1); |
| 251 output[7] = fdct_round_shift(t3); | 255 output[7] = fdct_round_shift(t3); |
| 252 } | 256 } |
| 253 | 257 |
| 254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { | 258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { |
| 255 int r, c; | 259 int r, c; |
| 256 int16_t sum = 0; | 260 tran_low_t sum = 0; |
| 257 for (r = 0; r < 8; ++r) | 261 for (r = 0; r < 8; ++r) |
| 258 for (c = 0; c < 8; ++c) | 262 for (c = 0; c < 8; ++c) |
| 259 sum += input[r * stride + c]; | 263 sum += input[r * stride + c]; |
| 260 | 264 |
| 261 output[0] = sum; | 265 output[0] = sum; |
| 262 output[1] = 0; | 266 output[1] = 0; |
| 263 } | 267 } |
| 264 | 268 |
| 265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { | 269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { |
| 266 int i, j; | 270 int i, j; |
| 267 int16_t intermediate[64]; | 271 tran_low_t intermediate[64]; |
| 268 | 272 |
| 269 // Transform columns | 273 // Transform columns |
| 270 { | 274 { |
| 271 int16_t *output = intermediate; | 275 tran_low_t *output = intermediate; |
| 272 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
| 273 /*needs32*/ int t0, t1, t2, t3; | 277 tran_high_t t0, t1, t2, t3; // needs32 |
| 274 /*canbe16*/ int x0, x1, x2, x3; | 278 tran_high_t x0, x1, x2, x3; // canbe16 |
| 275 | 279 |
| 276 int i; | 280 int i; |
| 277 for (i = 0; i < 8; i++) { | 281 for (i = 0; i < 8; i++) { |
| 278 // stage 1 | 282 // stage 1 |
| 279 s0 = (input[0 * stride] + input[7 * stride]) * 4; | 283 s0 = (input[0 * stride] + input[7 * stride]) * 4; |
| 280 s1 = (input[1 * stride] + input[6 * stride]) * 4; | 284 s1 = (input[1 * stride] + input[6 * stride]) * 4; |
| 281 s2 = (input[2 * stride] + input[5 * stride]) * 4; | 285 s2 = (input[2 * stride] + input[5 * stride]) * 4; |
| 282 s3 = (input[3 * stride] + input[4 * stride]) * 4; | 286 s3 = (input[3 * stride] + input[4 * stride]) * 4; |
| 283 s4 = (input[3 * stride] - input[4 * stride]) * 4; | 287 s4 = (input[3 * stride] - input[4 * stride]) * 4; |
| 284 s5 = (input[2 * stride] - input[5 * stride]) * 4; | 288 s5 = (input[2 * stride] - input[5 * stride]) * 4; |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 326 } | 330 } |
| 327 | 331 |
| 328 // Rows | 332 // Rows |
| 329 for (i = 0; i < 8; ++i) { | 333 for (i = 0; i < 8; ++i) { |
| 330 fdct8(&intermediate[i * 8], &final_output[i * 8]); | 334 fdct8(&intermediate[i * 8], &final_output[i * 8]); |
| 331 for (j = 0; j < 8; ++j) | 335 for (j = 0; j < 8; ++j) |
| 332 final_output[j + i * 8] /= 2; | 336 final_output[j + i * 8] /= 2; |
| 333 } | 337 } |
| 334 } | 338 } |
| 335 | 339 |
| 336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { | 340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { |
| 337 int r, c; | 341 int r, c; |
| 338 int16_t sum = 0; | 342 tran_low_t sum = 0; |
| 339 for (r = 0; r < 16; ++r) | 343 for (r = 0; r < 16; ++r) |
| 340 for (c = 0; c < 16; ++c) | 344 for (c = 0; c < 16; ++c) |
| 341 sum += input[r * stride + c]; | 345 sum += input[r * stride + c]; |
| 342 | 346 |
| 343 output[0] = sum >> 1; | 347 output[0] = sum >> 1; |
| 344 output[1] = 0; | 348 output[1] = 0; |
| 345 } | 349 } |
| 346 | 350 |
| 347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { | 351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { |
| 348 // The 2D transform is done with two passes which are actually pretty | 352 // The 2D transform is done with two passes which are actually pretty |
| 349 // similar. In the first one, we transform the columns and transpose | 353 // similar. In the first one, we transform the columns and transpose |
| 350 // the results. In the second one, we transform the rows. To achieve that, | 354 // the results. In the second one, we transform the rows. To achieve that, |
| 351 // as the first pass results are transposed, we transpose the columns (that | 355 // as the first pass results are transposed, we transpose the columns (that |
| 352 // is the transposed rows) and transpose the results (so that it goes back | 356 // is the transposed rows) and transpose the results (so that it goes back |
| 353 // in normal/row positions). | 357 // in normal/row positions). |
| 354 int pass; | 358 int pass; |
| 355 // We need an intermediate buffer between passes. | 359 // We need an intermediate buffer between passes. |
| 356 int16_t intermediate[256]; | 360 tran_low_t intermediate[256]; |
| 357 const int16_t *in = input; | 361 const int16_t *in_pass0 = input; |
| 358 int16_t *out = intermediate; | 362 const tran_low_t *in = NULL; |
| 363 tran_low_t *out = intermediate; |
| 359 // Do the two transform/transpose passes | 364 // Do the two transform/transpose passes |
| 360 for (pass = 0; pass < 2; ++pass) { | 365 for (pass = 0; pass < 2; ++pass) { |
| 361 /*canbe16*/ int step1[8]; | 366 tran_high_t step1[8]; // canbe16 |
| 362 /*canbe16*/ int step2[8]; | 367 tran_high_t step2[8]; // canbe16 |
| 363 /*canbe16*/ int step3[8]; | 368 tran_high_t step3[8]; // canbe16 |
| 364 /*canbe16*/ int input[8]; | 369 tran_high_t input[8]; // canbe16 |
| 365 /*needs32*/ int temp1, temp2; | 370 tran_high_t temp1, temp2; // needs32 |
| 366 int i; | 371 int i; |
| 367 for (i = 0; i < 16; i++) { | 372 for (i = 0; i < 16; i++) { |
| 368 if (0 == pass) { | 373 if (0 == pass) { |
| 369 // Calculate input for the first 8 results. | 374 // Calculate input for the first 8 results. |
| 370 input[0] = (in[0 * stride] + in[15 * stride]) * 4; | 375 input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; |
| 371 input[1] = (in[1 * stride] + in[14 * stride]) * 4; | 376 input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; |
| 372 input[2] = (in[2 * stride] + in[13 * stride]) * 4; | 377 input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; |
| 373 input[3] = (in[3 * stride] + in[12 * stride]) * 4; | 378 input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; |
| 374 input[4] = (in[4 * stride] + in[11 * stride]) * 4; | 379 input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; |
| 375 input[5] = (in[5 * stride] + in[10 * stride]) * 4; | 380 input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; |
| 376 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; | 381 input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; |
| 377 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; | 382 input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; |
| 378 // Calculate input for the next 8 results. | 383 // Calculate input for the next 8 results. |
| 379 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; | 384 step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; |
| 380 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; | 385 step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; |
| 381 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; | 386 step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; |
| 382 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; | 387 step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; |
| 383 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; | 388 step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; |
| 384 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; | 389 step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; |
| 385 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; | 390 step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; |
| 386 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; | 391 step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; |
| 387 } else { | 392 } else { |
| 388 // Calculate input for the first 8 results. | 393 // Calculate input for the first 8 results. |
| 389 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); | 394 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); |
| 390 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); | 395 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); |
| 391 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); | 396 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); |
| 392 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); | 397 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); |
| 393 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); | 398 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); |
| 394 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); | 399 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); |
| 395 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); | 400 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); |
| 396 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); | 401 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); |
| 397 // Calculate input for the next 8 results. | 402 // Calculate input for the next 8 results. |
| 398 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); | 403 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); |
| 399 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); | 404 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); |
| 400 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); | 405 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); |
| 401 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); | 406 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); |
| 402 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); | 407 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); |
| 403 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); | 408 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); |
| 404 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); | 409 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); |
| 405 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); | 410 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); |
| 406 } | 411 } |
| 407 // Work on the first eight values; fdct8(input, even_results); | 412 // Work on the first eight values; fdct8(input, even_results); |
| 408 { | 413 { |
| 409 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 414 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
| 410 /*needs32*/ int t0, t1, t2, t3; | 415 tran_high_t t0, t1, t2, t3; // needs32 |
| 411 /*canbe16*/ int x0, x1, x2, x3; | 416 tran_high_t x0, x1, x2, x3; // canbe16 |
| 412 | 417 |
| 413 // stage 1 | 418 // stage 1 |
| 414 s0 = input[0] + input[7]; | 419 s0 = input[0] + input[7]; |
| 415 s1 = input[1] + input[6]; | 420 s1 = input[1] + input[6]; |
| 416 s2 = input[2] + input[5]; | 421 s2 = input[2] + input[5]; |
| 417 s3 = input[3] + input[4]; | 422 s3 = input[3] + input[4]; |
| 418 s4 = input[3] - input[4]; | 423 s4 = input[3] - input[4]; |
| 419 s5 = input[2] - input[5]; | 424 s5 = input[2] - input[5]; |
| 420 s6 = input[1] - input[6]; | 425 s6 = input[1] - input[6]; |
| 421 s7 = input[0] - input[7]; | 426 s7 = input[0] - input[7]; |
| (...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 507 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 512 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
| 508 out[3] = fdct_round_shift(temp1); | 513 out[3] = fdct_round_shift(temp1); |
| 509 out[11] = fdct_round_shift(temp2); | 514 out[11] = fdct_round_shift(temp2); |
| 510 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 515 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
| 511 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; | 516 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
| 512 out[7] = fdct_round_shift(temp1); | 517 out[7] = fdct_round_shift(temp1); |
| 513 out[15] = fdct_round_shift(temp2); | 518 out[15] = fdct_round_shift(temp2); |
| 514 } | 519 } |
| 515 // Do next column (which is a transposed row in second/horizontal pass) | 520 // Do next column (which is a transposed row in second/horizontal pass) |
| 516 in++; | 521 in++; |
| 522 in_pass0++; |
| 517 out += 16; | 523 out += 16; |
| 518 } | 524 } |
| 519 // Setup in/out for next pass. | 525 // Setup in/out for next pass. |
| 520 in = intermediate; | 526 in = intermediate; |
| 521 out = output; | 527 out = output; |
| 522 } | 528 } |
| 523 } | 529 } |
| 524 | 530 |
| 525 static void fadst8(const int16_t *input, int16_t *output) { | 531 static void fadst8(const tran_low_t *input, tran_low_t *output) { |
| 526 int s0, s1, s2, s3, s4, s5, s6, s7; | 532 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
| 527 | 533 |
| 528 int x0 = input[7]; | 534 tran_high_t x0 = input[7]; |
| 529 int x1 = input[0]; | 535 tran_high_t x1 = input[0]; |
| 530 int x2 = input[5]; | 536 tran_high_t x2 = input[5]; |
| 531 int x3 = input[2]; | 537 tran_high_t x3 = input[2]; |
| 532 int x4 = input[3]; | 538 tran_high_t x4 = input[3]; |
| 533 int x5 = input[4]; | 539 tran_high_t x5 = input[4]; |
| 534 int x6 = input[1]; | 540 tran_high_t x6 = input[1]; |
| 535 int x7 = input[6]; | 541 tran_high_t x7 = input[6]; |
| 536 | 542 |
| 537 // stage 1 | 543 // stage 1 |
| 538 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | 544 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; |
| 539 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | 545 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; |
| 540 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | 546 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; |
| 541 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | 547 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; |
| 542 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | 548 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; |
| 543 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | 549 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; |
| 544 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; | 550 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; |
| 545 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; | 551 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 593 output[7] = - x1; | 599 output[7] = - x1; |
| 594 } | 600 } |
| 595 | 601 |
| 596 static const transform_2d FHT_8[] = { | 602 static const transform_2d FHT_8[] = { |
| 597 { fdct8, fdct8 }, // DCT_DCT = 0 | 603 { fdct8, fdct8 }, // DCT_DCT = 0 |
| 598 { fadst8, fdct8 }, // ADST_DCT = 1 | 604 { fadst8, fdct8 }, // ADST_DCT = 1 |
| 599 { fdct8, fadst8 }, // DCT_ADST = 2 | 605 { fdct8, fadst8 }, // DCT_ADST = 2 |
| 600 { fadst8, fadst8 } // ADST_ADST = 3 | 606 { fadst8, fadst8 } // ADST_ADST = 3 |
| 601 }; | 607 }; |
| 602 | 608 |
| 603 void vp9_fht8x8_c(const int16_t *input, int16_t *output, | 609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, |
| 604 int stride, int tx_type) { | 610 int stride, int tx_type) { |
| 605 if (tx_type == DCT_DCT) { | 611 if (tx_type == DCT_DCT) { |
| 606 vp9_fdct8x8_c(input, output, stride); | 612 vp9_fdct8x8_c(input, output, stride); |
| 607 } else { | 613 } else { |
| 608 int16_t out[64]; | 614 tran_low_t out[64]; |
| 609 int16_t *outptr = &out[0]; | 615 tran_low_t *outptr = &out[0]; |
| 610 int i, j; | 616 int i, j; |
| 611 int16_t temp_in[8], temp_out[8]; | 617 tran_low_t temp_in[8], temp_out[8]; |
| 612 const transform_2d ht = FHT_8[tx_type]; | 618 const transform_2d ht = FHT_8[tx_type]; |
| 613 | 619 |
| 614 // Columns | 620 // Columns |
| 615 for (i = 0; i < 8; ++i) { | 621 for (i = 0; i < 8; ++i) { |
| 616 for (j = 0; j < 8; ++j) | 622 for (j = 0; j < 8; ++j) |
| 617 temp_in[j] = input[j * stride + i] * 4; | 623 temp_in[j] = input[j * stride + i] * 4; |
| 618 ht.cols(temp_in, temp_out); | 624 ht.cols(temp_in, temp_out); |
| 619 for (j = 0; j < 8; ++j) | 625 for (j = 0; j < 8; ++j) |
| 620 outptr[j * 8 + i] = temp_out[j]; | 626 outptr[j * 8 + i] = temp_out[j]; |
| 621 } | 627 } |
| 622 | 628 |
| 623 // Rows | 629 // Rows |
| 624 for (i = 0; i < 8; ++i) { | 630 for (i = 0; i < 8; ++i) { |
| 625 for (j = 0; j < 8; ++j) | 631 for (j = 0; j < 8; ++j) |
| 626 temp_in[j] = out[j + i * 8]; | 632 temp_in[j] = out[j + i * 8]; |
| 627 ht.rows(temp_in, temp_out); | 633 ht.rows(temp_in, temp_out); |
| 628 for (j = 0; j < 8; ++j) | 634 for (j = 0; j < 8; ++j) |
| 629 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 635 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
| 630 } | 636 } |
| 631 } | 637 } |
| 632 } | 638 } |
| 633 | 639 |
| 634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per | 640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
| 635 pixel. */ | 641 pixel. */ |
| 636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { | 642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
| 637 int i; | 643 int i; |
| 638 int a1, b1, c1, d1, e1; | 644 tran_high_t a1, b1, c1, d1, e1; |
| 639 const int16_t *ip = input; | 645 const int16_t *ip_pass0 = input; |
| 640 int16_t *op = output; | 646 const tran_low_t *ip = NULL; |
| 647 tran_low_t *op = output; |
| 641 | 648 |
| 642 for (i = 0; i < 4; i++) { | 649 for (i = 0; i < 4; i++) { |
| 643 a1 = ip[0 * stride]; | 650 a1 = ip_pass0[0 * stride]; |
| 644 b1 = ip[1 * stride]; | 651 b1 = ip_pass0[1 * stride]; |
| 645 c1 = ip[2 * stride]; | 652 c1 = ip_pass0[2 * stride]; |
| 646 d1 = ip[3 * stride]; | 653 d1 = ip_pass0[3 * stride]; |
| 647 | 654 |
| 648 a1 += b1; | 655 a1 += b1; |
| 649 d1 = d1 - c1; | 656 d1 = d1 - c1; |
| 650 e1 = (a1 - d1) >> 1; | 657 e1 = (a1 - d1) >> 1; |
| 651 b1 = e1 - b1; | 658 b1 = e1 - b1; |
| 652 c1 = e1 - c1; | 659 c1 = e1 - c1; |
| 653 a1 -= c1; | 660 a1 -= c1; |
| 654 d1 += b1; | 661 d1 += b1; |
| 655 op[0] = a1; | 662 op[0] = a1; |
| 656 op[4] = c1; | 663 op[4] = c1; |
| 657 op[8] = d1; | 664 op[8] = d1; |
| 658 op[12] = b1; | 665 op[12] = b1; |
| 659 | 666 |
| 660 ip++; | 667 ip_pass0++; |
| 661 op++; | 668 op++; |
| 662 } | 669 } |
| 663 ip = output; | 670 ip = output; |
| 664 op = output; | 671 op = output; |
| 665 | 672 |
| 666 for (i = 0; i < 4; i++) { | 673 for (i = 0; i < 4; i++) { |
| 667 a1 = ip[0]; | 674 a1 = ip[0]; |
| 668 b1 = ip[1]; | 675 b1 = ip[1]; |
| 669 c1 = ip[2]; | 676 c1 = ip[2]; |
| 670 d1 = ip[3]; | 677 d1 = ip[3]; |
| 671 | 678 |
| 672 a1 += b1; | 679 a1 += b1; |
| 673 d1 -= c1; | 680 d1 -= c1; |
| 674 e1 = (a1 - d1) >> 1; | 681 e1 = (a1 - d1) >> 1; |
| 675 b1 = e1 - b1; | 682 b1 = e1 - b1; |
| 676 c1 = e1 - c1; | 683 c1 = e1 - c1; |
| 677 a1 -= c1; | 684 a1 -= c1; |
| 678 d1 += b1; | 685 d1 += b1; |
| 679 op[0] = a1 * UNIT_QUANT_FACTOR; | 686 op[0] = a1 * UNIT_QUANT_FACTOR; |
| 680 op[1] = c1 * UNIT_QUANT_FACTOR; | 687 op[1] = c1 * UNIT_QUANT_FACTOR; |
| 681 op[2] = d1 * UNIT_QUANT_FACTOR; | 688 op[2] = d1 * UNIT_QUANT_FACTOR; |
| 682 op[3] = b1 * UNIT_QUANT_FACTOR; | 689 op[3] = b1 * UNIT_QUANT_FACTOR; |
| 683 | 690 |
| 684 ip += 4; | 691 ip += 4; |
| 685 op += 4; | 692 op += 4; |
| 686 } | 693 } |
| 687 } | 694 } |
| 688 | 695 |
| 689 // Rewrote to use same algorithm as others. | 696 // Rewrote to use same algorithm as others. |
| 690 static void fdct16(const int16_t in[16], int16_t out[16]) { | 697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { |
| 691 /*canbe16*/ int step1[8]; | 698 tran_high_t step1[8]; // canbe16 |
| 692 /*canbe16*/ int step2[8]; | 699 tran_high_t step2[8]; // canbe16 |
| 693 /*canbe16*/ int step3[8]; | 700 tran_high_t step3[8]; // canbe16 |
| 694 /*canbe16*/ int input[8]; | 701 tran_high_t input[8]; // canbe16 |
| 695 /*needs32*/ int temp1, temp2; | 702 tran_high_t temp1, temp2; // needs32 |
| 696 | 703 |
| 697 // step 1 | 704 // step 1 |
| 698 input[0] = in[0] + in[15]; | 705 input[0] = in[0] + in[15]; |
| 699 input[1] = in[1] + in[14]; | 706 input[1] = in[1] + in[14]; |
| 700 input[2] = in[2] + in[13]; | 707 input[2] = in[2] + in[13]; |
| 701 input[3] = in[3] + in[12]; | 708 input[3] = in[3] + in[12]; |
| 702 input[4] = in[4] + in[11]; | 709 input[4] = in[4] + in[11]; |
| 703 input[5] = in[5] + in[10]; | 710 input[5] = in[5] + in[10]; |
| 704 input[6] = in[6] + in[ 9]; | 711 input[6] = in[6] + in[ 9]; |
| 705 input[7] = in[7] + in[ 8]; | 712 input[7] = in[7] + in[ 8]; |
| 706 | 713 |
| 707 step1[0] = in[7] - in[ 8]; | 714 step1[0] = in[7] - in[ 8]; |
| 708 step1[1] = in[6] - in[ 9]; | 715 step1[1] = in[6] - in[ 9]; |
| 709 step1[2] = in[5] - in[10]; | 716 step1[2] = in[5] - in[10]; |
| 710 step1[3] = in[4] - in[11]; | 717 step1[3] = in[4] - in[11]; |
| 711 step1[4] = in[3] - in[12]; | 718 step1[4] = in[3] - in[12]; |
| 712 step1[5] = in[2] - in[13]; | 719 step1[5] = in[2] - in[13]; |
| 713 step1[6] = in[1] - in[14]; | 720 step1[6] = in[1] - in[14]; |
| 714 step1[7] = in[0] - in[15]; | 721 step1[7] = in[0] - in[15]; |
| 715 | 722 |
| 716 // fdct8(step, step); | 723 // fdct8(step, step); |
| 717 { | 724 { |
| 718 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 725 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
| 719 /*needs32*/ int t0, t1, t2, t3; | 726 tran_high_t t0, t1, t2, t3; // needs32 |
| 720 /*canbe16*/ int x0, x1, x2, x3; | 727 tran_high_t x0, x1, x2, x3; // canbe16 |
| 721 | 728 |
| 722 // stage 1 | 729 // stage 1 |
| 723 s0 = input[0] + input[7]; | 730 s0 = input[0] + input[7]; |
| 724 s1 = input[1] + input[6]; | 731 s1 = input[1] + input[6]; |
| 725 s2 = input[2] + input[5]; | 732 s2 = input[2] + input[5]; |
| 726 s3 = input[3] + input[4]; | 733 s3 = input[3] + input[4]; |
| 727 s4 = input[3] - input[4]; | 734 s4 = input[3] - input[4]; |
| 728 s5 = input[2] - input[5]; | 735 s5 = input[2] - input[5]; |
| 729 s6 = input[1] - input[6]; | 736 s6 = input[1] - input[6]; |
| 730 s7 = input[0] - input[7]; | 737 s7 = input[0] - input[7]; |
| (...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 821 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 828 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
| 822 out[3] = fdct_round_shift(temp1); | 829 out[3] = fdct_round_shift(temp1); |
| 823 out[11] = fdct_round_shift(temp2); | 830 out[11] = fdct_round_shift(temp2); |
| 824 | 831 |
| 825 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 832 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
| 826 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; | 833 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
| 827 out[7] = fdct_round_shift(temp1); | 834 out[7] = fdct_round_shift(temp1); |
| 828 out[15] = fdct_round_shift(temp2); | 835 out[15] = fdct_round_shift(temp2); |
| 829 } | 836 } |
| 830 | 837 |
| 831 static void fadst16(const int16_t *input, int16_t *output) { | 838 static void fadst16(const tran_low_t *input, tran_low_t *output) { |
| 832 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 839 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; |
| 840 tran_high_t s9, s10, s11, s12, s13, s14, s15; |
| 833 | 841 |
| 834 int x0 = input[15]; | 842 tran_high_t x0 = input[15]; |
| 835 int x1 = input[0]; | 843 tran_high_t x1 = input[0]; |
| 836 int x2 = input[13]; | 844 tran_high_t x2 = input[13]; |
| 837 int x3 = input[2]; | 845 tran_high_t x3 = input[2]; |
| 838 int x4 = input[11]; | 846 tran_high_t x4 = input[11]; |
| 839 int x5 = input[4]; | 847 tran_high_t x5 = input[4]; |
| 840 int x6 = input[9]; | 848 tran_high_t x6 = input[9]; |
| 841 int x7 = input[6]; | 849 tran_high_t x7 = input[6]; |
| 842 int x8 = input[7]; | 850 tran_high_t x8 = input[7]; |
| 843 int x9 = input[8]; | 851 tran_high_t x9 = input[8]; |
| 844 int x10 = input[5]; | 852 tran_high_t x10 = input[5]; |
| 845 int x11 = input[10]; | 853 tran_high_t x11 = input[10]; |
| 846 int x12 = input[3]; | 854 tran_high_t x12 = input[3]; |
| 847 int x13 = input[12]; | 855 tran_high_t x13 = input[12]; |
| 848 int x14 = input[1]; | 856 tran_high_t x14 = input[1]; |
| 849 int x15 = input[14]; | 857 tran_high_t x15 = input[14]; |
| 850 | 858 |
| 851 // stage 1 | 859 // stage 1 |
| 852 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; | 860 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; |
| 853 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | 861 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; |
| 854 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; | 862 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; |
| 855 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | 863 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; |
| 856 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; | 864 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; |
| 857 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | 865 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; |
| 858 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | 866 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; |
| 859 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | 867 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; |
| (...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 990 output[15] = - x1; | 998 output[15] = - x1; |
| 991 } | 999 } |
| 992 | 1000 |
| 993 static const transform_2d FHT_16[] = { | 1001 static const transform_2d FHT_16[] = { |
| 994 { fdct16, fdct16 }, // DCT_DCT = 0 | 1002 { fdct16, fdct16 }, // DCT_DCT = 0 |
| 995 { fadst16, fdct16 }, // ADST_DCT = 1 | 1003 { fadst16, fdct16 }, // ADST_DCT = 1 |
| 996 { fdct16, fadst16 }, // DCT_ADST = 2 | 1004 { fdct16, fadst16 }, // DCT_ADST = 2 |
| 997 { fadst16, fadst16 } // ADST_ADST = 3 | 1005 { fadst16, fadst16 } // ADST_ADST = 3 |
| 998 }; | 1006 }; |
| 999 | 1007 |
| 1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output, | 1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, |
| 1001 int stride, int tx_type) { | 1009 int stride, int tx_type) { |
| 1002 if (tx_type == DCT_DCT) { | 1010 if (tx_type == DCT_DCT) { |
| 1003 vp9_fdct16x16_c(input, output, stride); | 1011 vp9_fdct16x16_c(input, output, stride); |
| 1004 } else { | 1012 } else { |
| 1005 int16_t out[256]; | 1013 tran_low_t out[256]; |
| 1006 int16_t *outptr = &out[0]; | 1014 tran_low_t *outptr = &out[0]; |
| 1007 int i, j; | 1015 int i, j; |
| 1008 int16_t temp_in[16], temp_out[16]; | 1016 tran_low_t temp_in[16], temp_out[16]; |
| 1009 const transform_2d ht = FHT_16[tx_type]; | 1017 const transform_2d ht = FHT_16[tx_type]; |
| 1010 | 1018 |
| 1011 // Columns | 1019 // Columns |
| 1012 for (i = 0; i < 16; ++i) { | 1020 for (i = 0; i < 16; ++i) { |
| 1013 for (j = 0; j < 16; ++j) | 1021 for (j = 0; j < 16; ++j) |
| 1014 temp_in[j] = input[j * stride + i] * 4; | 1022 temp_in[j] = input[j * stride + i] * 4; |
| 1015 ht.cols(temp_in, temp_out); | 1023 ht.cols(temp_in, temp_out); |
| 1016 for (j = 0; j < 16; ++j) | 1024 for (j = 0; j < 16; ++j) |
| 1017 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1025 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
| 1018 } | 1026 } |
| 1019 | 1027 |
| 1020 // Rows | 1028 // Rows |
| 1021 for (i = 0; i < 16; ++i) { | 1029 for (i = 0; i < 16; ++i) { |
| 1022 for (j = 0; j < 16; ++j) | 1030 for (j = 0; j < 16; ++j) |
| 1023 temp_in[j] = out[j + i * 16]; | 1031 temp_in[j] = out[j + i * 16]; |
| 1024 ht.rows(temp_in, temp_out); | 1032 ht.rows(temp_in, temp_out); |
| 1025 for (j = 0; j < 16; ++j) | 1033 for (j = 0; j < 16; ++j) |
| 1026 output[j + i * 16] = temp_out[j]; | 1034 output[j + i * 16] = temp_out[j]; |
| 1027 } | 1035 } |
| 1028 } | 1036 } |
| 1029 } | 1037 } |
| 1030 | 1038 |
| 1031 static INLINE int dct_32_round(int input) { | 1039 static INLINE tran_high_t dct_32_round(tran_high_t input) { |
| 1032 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 1040 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
| 1033 assert(-131072 <= rv && rv <= 131071); | 1041 // TODO(debargha, peter.derivaz): Find new bounds for this assert, |
| 1042 // and make the bounds consts. |
| 1043 // assert(-131072 <= rv && rv <= 131071); |
| 1034 return rv; | 1044 return rv; |
| 1035 } | 1045 } |
| 1036 | 1046 |
| 1037 static INLINE int half_round_shift(int input) { | 1047 static INLINE tran_high_t half_round_shift(tran_high_t input) { |
| 1038 int rv = (input + 1 + (input < 0)) >> 2; | 1048 tran_high_t rv = (input + 1 + (input < 0)) >> 2; |
| 1039 return rv; | 1049 return rv; |
| 1040 } | 1050 } |
| 1041 | 1051 |
| 1042 static void fdct32(const int *input, int *output, int round) { | 1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) { |
| 1043 int step[32]; | 1053 tran_high_t step[32]; |
| 1044 // Stage 1 | 1054 // Stage 1 |
| 1045 step[0] = input[0] + input[(32 - 1)]; | 1055 step[0] = input[0] + input[(32 - 1)]; |
| 1046 step[1] = input[1] + input[(32 - 2)]; | 1056 step[1] = input[1] + input[(32 - 2)]; |
| 1047 step[2] = input[2] + input[(32 - 3)]; | 1057 step[2] = input[2] + input[(32 - 3)]; |
| 1048 step[3] = input[3] + input[(32 - 4)]; | 1058 step[3] = input[3] + input[(32 - 4)]; |
| 1049 step[4] = input[4] + input[(32 - 5)]; | 1059 step[4] = input[4] + input[(32 - 5)]; |
| 1050 step[5] = input[5] + input[(32 - 6)]; | 1060 step[5] = input[5] + input[(32 - 6)]; |
| 1051 step[6] = input[6] + input[(32 - 7)]; | 1061 step[6] = input[6] + input[(32 - 7)]; |
| 1052 step[7] = input[7] + input[(32 - 8)]; | 1062 step[7] = input[7] + input[(32 - 8)]; |
| 1053 step[8] = input[8] + input[(32 - 9)]; | 1063 step[8] = input[8] + input[(32 - 9)]; |
| (...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1355 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); | 1365 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); |
| 1356 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); | 1366 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); |
| 1357 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); | 1367 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); |
| 1358 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); | 1368 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); |
| 1359 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); | 1369 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); |
| 1360 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); | 1370 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); |
| 1361 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); | 1371 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); |
| 1362 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); | 1372 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); |
| 1363 } | 1373 } |
| 1364 | 1374 |
| 1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { | 1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { |
| 1366 int r, c; | 1376 int r, c; |
| 1367 int16_t sum = 0; | 1377 tran_low_t sum = 0; |
| 1368 for (r = 0; r < 32; ++r) | 1378 for (r = 0; r < 32; ++r) |
| 1369 for (c = 0; c < 32; ++c) | 1379 for (c = 0; c < 32; ++c) |
| 1370 sum += input[r * stride + c]; | 1380 sum += input[r * stride + c]; |
| 1371 | 1381 |
| 1372 output[0] = sum >> 3; | 1382 output[0] = sum >> 3; |
| 1373 output[1] = 0; | 1383 output[1] = 0; |
| 1374 } | 1384 } |
| 1375 | 1385 |
| 1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { | 1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { |
| 1377 int i, j; | 1387 int i, j; |
| 1378 int output[32 * 32]; | 1388 tran_high_t output[32 * 32]; |
| 1379 | 1389 |
| 1380 // Columns | 1390 // Columns |
| 1381 for (i = 0; i < 32; ++i) { | 1391 for (i = 0; i < 32; ++i) { |
| 1382 int temp_in[32], temp_out[32]; | 1392 tran_high_t temp_in[32], temp_out[32]; |
| 1383 for (j = 0; j < 32; ++j) | 1393 for (j = 0; j < 32; ++j) |
| 1384 temp_in[j] = input[j * stride + i] * 4; | 1394 temp_in[j] = input[j * stride + i] * 4; |
| 1385 fdct32(temp_in, temp_out, 0); | 1395 fdct32(temp_in, temp_out, 0); |
| 1386 for (j = 0; j < 32; ++j) | 1396 for (j = 0; j < 32; ++j) |
| 1387 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1397 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 1388 } | 1398 } |
| 1389 | 1399 |
| 1390 // Rows | 1400 // Rows |
| 1391 for (i = 0; i < 32; ++i) { | 1401 for (i = 0; i < 32; ++i) { |
| 1392 int temp_in[32], temp_out[32]; | 1402 tran_high_t temp_in[32], temp_out[32]; |
| 1393 for (j = 0; j < 32; ++j) | 1403 for (j = 0; j < 32; ++j) |
| 1394 temp_in[j] = output[j + i * 32]; | 1404 temp_in[j] = output[j + i * 32]; |
| 1395 fdct32(temp_in, temp_out, 0); | 1405 fdct32(temp_in, temp_out, 0); |
| 1396 for (j = 0; j < 32; ++j) | 1406 for (j = 0; j < 32; ++j) |
| 1397 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1407 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
| 1398 } | 1408 } |
| 1399 } | 1409 } |
| 1400 | 1410 |
| 1401 // Note that although we use dct_32_round in dct32 computation flow, | 1411 // Note that although we use dct_32_round in dct32 computation flow, |
| 1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating | 1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating |
| 1403 // within 16 bits precision. | 1413 // within 16 bits precision. |
| 1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { | 1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { |
| 1405 int i, j; | 1415 int i, j; |
| 1406 int output[32 * 32]; | 1416 tran_high_t output[32 * 32]; |
| 1407 | 1417 |
| 1408 // Columns | 1418 // Columns |
| 1409 for (i = 0; i < 32; ++i) { | 1419 for (i = 0; i < 32; ++i) { |
| 1410 int temp_in[32], temp_out[32]; | 1420 tran_high_t temp_in[32], temp_out[32]; |
| 1411 for (j = 0; j < 32; ++j) | 1421 for (j = 0; j < 32; ++j) |
| 1412 temp_in[j] = input[j * stride + i] * 4; | 1422 temp_in[j] = input[j * stride + i] * 4; |
| 1413 fdct32(temp_in, temp_out, 0); | 1423 fdct32(temp_in, temp_out, 0); |
| 1414 for (j = 0; j < 32; ++j) | 1424 for (j = 0; j < 32; ++j) |
| 1415 // TODO(cd): see quality impact of only doing | 1425 // TODO(cd): see quality impact of only doing |
| 1416 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; | 1426 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; |
| 1417 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c | 1427 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c |
| 1418 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1428 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 1419 } | 1429 } |
| 1420 | 1430 |
| 1421 // Rows | 1431 // Rows |
| 1422 for (i = 0; i < 32; ++i) { | 1432 for (i = 0; i < 32; ++i) { |
| 1423 int temp_in[32], temp_out[32]; | 1433 tran_high_t temp_in[32], temp_out[32]; |
| 1424 for (j = 0; j < 32; ++j) | 1434 for (j = 0; j < 32; ++j) |
| 1425 temp_in[j] = output[j + i * 32]; | 1435 temp_in[j] = output[j + i * 32]; |
| 1426 fdct32(temp_in, temp_out, 1); | 1436 fdct32(temp_in, temp_out, 1); |
| 1427 for (j = 0; j < 32; ++j) | 1437 for (j = 0; j < 32; ++j) |
| 1428 out[j + i * 32] = temp_out[j]; | 1438 out[j + i * 32] = temp_out[j]; |
| 1429 } | 1439 } |
| 1430 } | 1440 } |
| 1441 |
| 1442 #if CONFIG_VP9_HIGHBITDEPTH |
| 1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
| 1444 vp9_fdct4x4_c(input, output, stride); |
| 1445 } |
| 1446 |
| 1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output, |
| 1448 int stride, int tx_type) { |
| 1449 vp9_fht4x4_c(input, output, stride, tx_type); |
| 1450 } |
| 1451 |
| 1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, |
| 1453 int stride) { |
| 1454 vp9_fdct8x8_1_c(input, final_output, stride); |
| 1455 } |
| 1456 |
| 1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output, |
| 1458 int stride) { |
| 1459 vp9_fdct8x8_c(input, final_output, stride); |
| 1460 } |
| 1461 |
| 1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output, |
| 1463 int stride) { |
| 1464 vp9_fdct16x16_1_c(input, output, stride); |
| 1465 } |
| 1466 |
| 1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output, |
| 1468 int stride) { |
| 1469 vp9_fdct16x16_c(input, output, stride); |
| 1470 } |
| 1471 |
| 1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output, |
| 1473 int stride, int tx_type) { |
| 1474 vp9_fht8x8_c(input, output, stride, tx_type); |
| 1475 } |
| 1476 |
| 1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
| 1478 vp9_fwht4x4_c(input, output, stride); |
| 1479 } |
| 1480 |
| 1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output, |
| 1482 int stride, int tx_type) { |
| 1483 vp9_fht16x16_c(input, output, stride, tx_type); |
| 1484 } |
| 1485 |
| 1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) { |
| 1487 vp9_fdct32x32_1_c(input, out, stride); |
| 1488 } |
| 1489 |
| 1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { |
| 1491 vp9_fdct32x32_c(input, out, stride); |
| 1492 } |
| 1493 |
| 1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, |
| 1495 int stride) { |
| 1496 vp9_fdct32x32_rd_c(input, out, stride); |
| 1497 } |
| 1498 #endif // CONFIG_VP9_HIGHBITDEPTH |
| OLD | NEW |