| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 | |
| 12 #include <assert.h> | 11 #include <assert.h> |
| 13 #include <math.h> | 12 #include <math.h> |
| 13 |
| 14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 15 #include "vp9/common/vp9_systemdependent.h" | 15 #include "./vp9_rtcd.h" |
| 16 | 16 |
| 17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
| 18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
| 19 #include "vp9/common/vp9_systemdependent.h" |
| 19 | 20 |
| 20 static void fdct4_1d(int16_t *input, int16_t *output) { | 21 #include "vp9/encoder/vp9_dct.h" |
| 22 |
| 23 static void fdct4(const int16_t *input, int16_t *output) { |
| 21 int16_t step[4]; | 24 int16_t step[4]; |
| 22 int temp1, temp2; | 25 int temp1, temp2; |
| 23 | 26 |
| 24 step[0] = input[0] + input[3]; | 27 step[0] = input[0] + input[3]; |
| 25 step[1] = input[1] + input[2]; | 28 step[1] = input[1] + input[2]; |
| 26 step[2] = input[1] - input[2]; | 29 step[2] = input[1] - input[2]; |
| 27 step[3] = input[0] - input[3]; | 30 step[3] = input[0] - input[3]; |
| 28 | 31 |
| 29 temp1 = (step[0] + step[1]) * cospi_16_64; | 32 temp1 = (step[0] + step[1]) * cospi_16_64; |
| 30 temp2 = (step[0] - step[1]) * cospi_16_64; | 33 temp2 = (step[0] - step[1]) * cospi_16_64; |
| 31 output[0] = dct_const_round_shift(temp1); | 34 output[0] = dct_const_round_shift(temp1); |
| 32 output[2] = dct_const_round_shift(temp2); | 35 output[2] = dct_const_round_shift(temp2); |
| 33 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 36 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
| 34 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 37 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
| 35 output[1] = dct_const_round_shift(temp1); | 38 output[1] = dct_const_round_shift(temp1); |
| 36 output[3] = dct_const_round_shift(temp2); | 39 output[3] = dct_const_round_shift(temp2); |
| 37 } | 40 } |
| 38 | 41 |
| 39 void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { | 42 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { |
| 40 // The 2D transform is done with two passes which are actually pretty | 43 // The 2D transform is done with two passes which are actually pretty |
| 41 // similar. In the first one, we transform the columns and transpose | 44 // similar. In the first one, we transform the columns and transpose |
| 42 // the results. In the second one, we transform the rows. To achieve that, | 45 // the results. In the second one, we transform the rows. To achieve that, |
| 43 // as the first pass results are transposed, we transpose the columns (that | 46 // as the first pass results are transposed, we transpose the columns (that |
| 44 // is the transposed rows) and transpose the results (so that it goes back | 47 // is the transposed rows) and transpose the results (so that it goes back |
| 45 // in normal/row positions). | 48 // in normal/row positions). |
| 46 const int stride = pitch >> 1; | |
| 47 int pass; | 49 int pass; |
| 48 // We need an intermediate buffer between passes. | 50 // We need an intermediate buffer between passes. |
| 49 int16_t intermediate[4 * 4]; | 51 int16_t intermediate[4 * 4]; |
| 50 int16_t *in = input; | 52 const int16_t *in = input; |
| 51 int16_t *out = intermediate; | 53 int16_t *out = intermediate; |
| 52 // Do the two transform/transpose passes | 54 // Do the two transform/transpose passes |
| 53 for (pass = 0; pass < 2; ++pass) { | 55 for (pass = 0; pass < 2; ++pass) { |
| 54 /*canbe16*/ int input[4]; | 56 /*canbe16*/ int input[4]; |
| 55 /*canbe16*/ int step[4]; | 57 /*canbe16*/ int step[4]; |
| 56 /*needs32*/ int temp1, temp2; | 58 /*needs32*/ int temp1, temp2; |
| 57 int i; | 59 int i; |
| 58 for (i = 0; i < 4; ++i) { | 60 for (i = 0; i < 4; ++i) { |
| 59 // Load inputs. | 61 // Load inputs. |
| 60 if (0 == pass) { | 62 if (0 == pass) { |
| 61 input[0] = in[0 * stride] << 4; | 63 input[0] = in[0 * stride] * 16; |
| 62 input[1] = in[1 * stride] << 4; | 64 input[1] = in[1 * stride] * 16; |
| 63 input[2] = in[2 * stride] << 4; | 65 input[2] = in[2 * stride] * 16; |
| 64 input[3] = in[3 * stride] << 4; | 66 input[3] = in[3 * stride] * 16; |
| 65 if (i == 0 && input[0]) { | 67 if (i == 0 && input[0]) { |
| 66 input[0] += 1; | 68 input[0] += 1; |
| 67 } | 69 } |
| 68 } else { | 70 } else { |
| 69 input[0] = in[0 * 4]; | 71 input[0] = in[0 * 4]; |
| 70 input[1] = in[1 * 4]; | 72 input[1] = in[1 * 4]; |
| 71 input[2] = in[2 * 4]; | 73 input[2] = in[2 * 4]; |
| 72 input[3] = in[3 * 4]; | 74 input[3] = in[3 * 4]; |
| 73 } | 75 } |
| 74 // Transform. | 76 // Transform. |
| (...skipping 20 matching lines...) |
| 95 | 97 |
| 96 { | 98 { |
| 97 int i, j; | 99 int i, j; |
| 98 for (i = 0; i < 4; ++i) { | 100 for (i = 0; i < 4; ++i) { |
| 99 for (j = 0; j < 4; ++j) | 101 for (j = 0; j < 4; ++j) |
| 100 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; | 102 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; |
| 101 } | 103 } |
| 102 } | 104 } |
| 103 } | 105 } |
| 104 | 106 |
| 105 static void fadst4_1d(int16_t *input, int16_t *output) { | 107 static void fadst4(const int16_t *input, int16_t *output) { |
| 106 int x0, x1, x2, x3; | 108 int x0, x1, x2, x3; |
| 107 int s0, s1, s2, s3, s4, s5, s6, s7; | 109 int s0, s1, s2, s3, s4, s5, s6, s7; |
| 108 | 110 |
| 109 x0 = input[0]; | 111 x0 = input[0]; |
| 110 x1 = input[1]; | 112 x1 = input[1]; |
| 111 x2 = input[2]; | 113 x2 = input[2]; |
| 112 x3 = input[3]; | 114 x3 = input[3]; |
| 113 | 115 |
| 114 if (!(x0 | x1 | x2 | x3)) { | 116 if (!(x0 | x1 | x2 | x3)) { |
| 115 output[0] = output[1] = output[2] = output[3] = 0; | 117 output[0] = output[1] = output[2] = output[3] = 0; |
| (...skipping 20 matching lines...) |
| 136 s3 = x2 - x0 + x3; | 138 s3 = x2 - x0 + x3; |
| 137 | 139 |
| 138 // 1-D transform scaling factor is sqrt(2). | 140 // 1-D transform scaling factor is sqrt(2). |
| 139 output[0] = dct_const_round_shift(s0); | 141 output[0] = dct_const_round_shift(s0); |
| 140 output[1] = dct_const_round_shift(s1); | 142 output[1] = dct_const_round_shift(s1); |
| 141 output[2] = dct_const_round_shift(s2); | 143 output[2] = dct_const_round_shift(s2); |
| 142 output[3] = dct_const_round_shift(s3); | 144 output[3] = dct_const_round_shift(s3); |
| 143 } | 145 } |
| 144 | 146 |
| 145 static const transform_2d FHT_4[] = { | 147 static const transform_2d FHT_4[] = { |
| 146 { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 | 148 { fdct4, fdct4 }, // DCT_DCT = 0 |
| 147 { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 | 149 { fadst4, fdct4 }, // ADST_DCT = 1 |
| 148 { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 | 150 { fdct4, fadst4 }, // DCT_ADST = 2 |
| 149 { fadst4_1d, fadst4_1d } // ADST_ADST = 3 | 151 { fadst4, fadst4 } // ADST_ADST = 3 |
| 150 }; | 152 }; |
| 151 | 153 |
| 152 void vp9_short_fht4x4_c(int16_t *input, int16_t *output, | 154 void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, |
| 153 int pitch, TX_TYPE tx_type) { | 155 int stride, int tx_type) { |
| 154 int16_t out[4 * 4]; | 156 int16_t out[4 * 4]; |
| 155 int16_t *outptr = &out[0]; | 157 int16_t *outptr = &out[0]; |
| 156 int i, j; | 158 int i, j; |
| 157 int16_t temp_in[4], temp_out[4]; | 159 int16_t temp_in[4], temp_out[4]; |
| 158 const transform_2d ht = FHT_4[tx_type]; | 160 const transform_2d ht = FHT_4[tx_type]; |
| 159 | 161 |
| 160 // Columns | 162 // Columns |
| 161 for (i = 0; i < 4; ++i) { | 163 for (i = 0; i < 4; ++i) { |
| 162 for (j = 0; j < 4; ++j) | 164 for (j = 0; j < 4; ++j) |
| 163 temp_in[j] = input[j * pitch + i] << 4; | 165 temp_in[j] = input[j * stride + i] * 16; |
| 164 if (i == 0 && temp_in[0]) | 166 if (i == 0 && temp_in[0]) |
| 165 temp_in[0] += 1; | 167 temp_in[0] += 1; |
| 166 ht.cols(temp_in, temp_out); | 168 ht.cols(temp_in, temp_out); |
| 167 for (j = 0; j < 4; ++j) | 169 for (j = 0; j < 4; ++j) |
| 168 outptr[j * 4 + i] = temp_out[j]; | 170 outptr[j * 4 + i] = temp_out[j]; |
| 169 } | 171 } |
| 170 | 172 |
| 171 // Rows | 173 // Rows |
| 172 for (i = 0; i < 4; ++i) { | 174 for (i = 0; i < 4; ++i) { |
| 173 for (j = 0; j < 4; ++j) | 175 for (j = 0; j < 4; ++j) |
| 174 temp_in[j] = out[j + i * 4]; | 176 temp_in[j] = out[j + i * 4]; |
| 175 ht.rows(temp_in, temp_out); | 177 ht.rows(temp_in, temp_out); |
| 176 for (j = 0; j < 4; ++j) | 178 for (j = 0; j < 4; ++j) |
| 177 output[j + i * 4] = (temp_out[j] + 1) >> 2; | 179 output[j + i * 4] = (temp_out[j] + 1) >> 2; |
| 178 } | 180 } |
| 179 } | 181 } |
| 180 | 182 |
| 181 void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { | 183 static void fdct8(const int16_t *input, int16_t *output) { |
| 182 vp9_short_fdct4x4_c(input, output, pitch); | |
| 183 vp9_short_fdct4x4_c(input + 4, output + 16, pitch); | |
| 184 } | |
| 185 | |
| 186 static void fdct8_1d(int16_t *input, int16_t *output) { | |
| 187 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 184 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
| 188 /*needs32*/ int t0, t1, t2, t3; | 185 /*needs32*/ int t0, t1, t2, t3; |
| 189 /*canbe16*/ int x0, x1, x2, x3; | 186 /*canbe16*/ int x0, x1, x2, x3; |
| 190 | 187 |
| 191 // stage 1 | 188 // stage 1 |
| 192 s0 = input[0] + input[7]; | 189 s0 = input[0] + input[7]; |
| 193 s1 = input[1] + input[6]; | 190 s1 = input[1] + input[6]; |
| 194 s2 = input[2] + input[5]; | 191 s2 = input[2] + input[5]; |
| 195 s3 = input[3] + input[4]; | 192 s3 = input[3] + input[4]; |
| 196 s4 = input[3] - input[4]; | 193 s4 = input[3] - input[4]; |
| 197 s5 = input[2] - input[5]; | 194 s5 = input[2] - input[5]; |
| 198 s6 = input[1] - input[6]; | 195 s6 = input[1] - input[6]; |
| 199 s7 = input[0] - input[7]; | 196 s7 = input[0] - input[7]; |
| 200 | 197 |
| 201 // fdct4_1d(step, step); | 198 // fdct4(step, step); |
| 202 x0 = s0 + s3; | 199 x0 = s0 + s3; |
| 203 x1 = s1 + s2; | 200 x1 = s1 + s2; |
| 204 x2 = s1 - s2; | 201 x2 = s1 - s2; |
| 205 x3 = s0 - s3; | 202 x3 = s0 - s3; |
| 206 t0 = (x0 + x1) * cospi_16_64; | 203 t0 = (x0 + x1) * cospi_16_64; |
| 207 t1 = (x0 - x1) * cospi_16_64; | 204 t1 = (x0 - x1) * cospi_16_64; |
| 208 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; | 205 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; |
| 209 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; | 206 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; |
| 210 output[0] = dct_const_round_shift(t0); | 207 output[0] = dct_const_round_shift(t0); |
| 211 output[2] = dct_const_round_shift(t2); | 208 output[2] = dct_const_round_shift(t2); |
| (...skipping 16 matching lines...) |
| 228 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; | 225 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
| 229 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; | 226 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
| 230 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; | 227 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
| 231 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; | 228 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
| 232 output[1] = dct_const_round_shift(t0); | 229 output[1] = dct_const_round_shift(t0); |
| 233 output[3] = dct_const_round_shift(t2); | 230 output[3] = dct_const_round_shift(t2); |
| 234 output[5] = dct_const_round_shift(t1); | 231 output[5] = dct_const_round_shift(t1); |
| 235 output[7] = dct_const_round_shift(t3); | 232 output[7] = dct_const_round_shift(t3); |
| 236 } | 233 } |
| 237 | 234 |
| 238 void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { | 235 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { |
| 239 const int stride = pitch >> 1; | |
| 240 int i, j; | 236 int i, j; |
| 241 int16_t intermediate[64]; | 237 int16_t intermediate[64]; |
| 242 | 238 |
| 243 // Transform columns | 239 // Transform columns |
| 244 { | 240 { |
| 245 int16_t *output = intermediate; | 241 int16_t *output = intermediate; |
| 246 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 242 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
| 247 /*needs32*/ int t0, t1, t2, t3; | 243 /*needs32*/ int t0, t1, t2, t3; |
| 248 /*canbe16*/ int x0, x1, x2, x3; | 244 /*canbe16*/ int x0, x1, x2, x3; |
| 249 | 245 |
| 250 int i; | 246 int i; |
| 251 for (i = 0; i < 8; i++) { | 247 for (i = 0; i < 8; i++) { |
| 252 // stage 1 | 248 // stage 1 |
| 253 s0 = (input[0 * stride] + input[7 * stride]) << 2; | 249 s0 = (input[0 * stride] + input[7 * stride]) * 4; |
| 254 s1 = (input[1 * stride] + input[6 * stride]) << 2; | 250 s1 = (input[1 * stride] + input[6 * stride]) * 4; |
| 255 s2 = (input[2 * stride] + input[5 * stride]) << 2; | 251 s2 = (input[2 * stride] + input[5 * stride]) * 4; |
| 256 s3 = (input[3 * stride] + input[4 * stride]) << 2; | 252 s3 = (input[3 * stride] + input[4 * stride]) * 4; |
| 257 s4 = (input[3 * stride] - input[4 * stride]) << 2; | 253 s4 = (input[3 * stride] - input[4 * stride]) * 4; |
| 258 s5 = (input[2 * stride] - input[5 * stride]) << 2; | 254 s5 = (input[2 * stride] - input[5 * stride]) * 4; |
| 259 s6 = (input[1 * stride] - input[6 * stride]) << 2; | 255 s6 = (input[1 * stride] - input[6 * stride]) * 4; |
| 260 s7 = (input[0 * stride] - input[7 * stride]) << 2; | 256 s7 = (input[0 * stride] - input[7 * stride]) * 4; |
| 261 | 257 |
| 262 // fdct4_1d(step, step); | 258 // fdct4(step, step); |
| 263 x0 = s0 + s3; | 259 x0 = s0 + s3; |
| 264 x1 = s1 + s2; | 260 x1 = s1 + s2; |
| 265 x2 = s1 - s2; | 261 x2 = s1 - s2; |
| 266 x3 = s0 - s3; | 262 x3 = s0 - s3; |
| 267 t0 = (x0 + x1) * cospi_16_64; | 263 t0 = (x0 + x1) * cospi_16_64; |
| 268 t1 = (x0 - x1) * cospi_16_64; | 264 t1 = (x0 - x1) * cospi_16_64; |
| 269 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; | 265 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; |
| 270 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; | 266 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; |
| 271 output[0 * 8] = dct_const_round_shift(t0); | 267 output[0 * 8] = dct_const_round_shift(t0); |
| 272 output[2 * 8] = dct_const_round_shift(t2); | 268 output[2 * 8] = dct_const_round_shift(t2); |
| (...skipping 21 matching lines...) |
| 294 output[3 * 8] = dct_const_round_shift(t2); | 290 output[3 * 8] = dct_const_round_shift(t2); |
| 295 output[5 * 8] = dct_const_round_shift(t1); | 291 output[5 * 8] = dct_const_round_shift(t1); |
| 296 output[7 * 8] = dct_const_round_shift(t3); | 292 output[7 * 8] = dct_const_round_shift(t3); |
| 297 input++; | 293 input++; |
| 298 output++; | 294 output++; |
| 299 } | 295 } |
| 300 } | 296 } |
| 301 | 297 |
| 302 // Rows | 298 // Rows |
| 303 for (i = 0; i < 8; ++i) { | 299 for (i = 0; i < 8; ++i) { |
| 304 fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); | 300 fdct8(&intermediate[i * 8], &final_output[i * 8]); |
| 305 for (j = 0; j < 8; ++j) | 301 for (j = 0; j < 8; ++j) |
| 306 final_output[j + i * 8] /= 2; | 302 final_output[j + i * 8] /= 2; |
| 307 } | 303 } |
| 308 } | 304 } |
| 309 | 305 |
| 310 void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { | 306 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { |
| 311 // The 2D transform is done with two passes which are actually pretty | 307 // The 2D transform is done with two passes which are actually pretty |
| 312 // similar. In the first one, we transform the columns and transpose | 308 // similar. In the first one, we transform the columns and transpose |
| 313 // the results. In the second one, we transform the rows. To achieve that, | 309 // the results. In the second one, we transform the rows. To achieve that, |
| 314 // as the first pass results are transposed, we transpose the columns (that | 310 // as the first pass results are transposed, we transpose the columns (that |
| 315 // is the transposed rows) and transpose the results (so that it goes back | 311 // is the transposed rows) and transpose the results (so that it goes back |
| 316 // in normal/row positions). | 312 // in normal/row positions). |
| 317 const int stride = pitch >> 1; | |
| 318 int pass; | 313 int pass; |
| 319 // We need an intermediate buffer between passes. | 314 // We need an intermediate buffer between passes. |
| 320 int16_t intermediate[256]; | 315 int16_t intermediate[256]; |
| 321 int16_t *in = input; | 316 const int16_t *in = input; |
| 322 int16_t *out = intermediate; | 317 int16_t *out = intermediate; |
| 323 // Do the two transform/transpose passes | 318 // Do the two transform/transpose passes |
| 324 for (pass = 0; pass < 2; ++pass) { | 319 for (pass = 0; pass < 2; ++pass) { |
| 325 /*canbe16*/ int step1[8]; | 320 /*canbe16*/ int step1[8]; |
| 326 /*canbe16*/ int step2[8]; | 321 /*canbe16*/ int step2[8]; |
| 327 /*canbe16*/ int step3[8]; | 322 /*canbe16*/ int step3[8]; |
| 328 /*canbe16*/ int input[8]; | 323 /*canbe16*/ int input[8]; |
| 329 /*needs32*/ int temp1, temp2; | 324 /*needs32*/ int temp1, temp2; |
| 330 int i; | 325 int i; |
| 331 for (i = 0; i < 16; i++) { | 326 for (i = 0; i < 16; i++) { |
| 332 if (0 == pass) { | 327 if (0 == pass) { |
| 333 // Calculate input for the first 8 results. | 328 // Calculate input for the first 8 results. |
| 334 input[0] = (in[0 * stride] + in[15 * stride]) << 2; | 329 input[0] = (in[0 * stride] + in[15 * stride]) * 4; |
| 335 input[1] = (in[1 * stride] + in[14 * stride]) << 2; | 330 input[1] = (in[1 * stride] + in[14 * stride]) * 4; |
| 336 input[2] = (in[2 * stride] + in[13 * stride]) << 2; | 331 input[2] = (in[2 * stride] + in[13 * stride]) * 4; |
| 337 input[3] = (in[3 * stride] + in[12 * stride]) << 2; | 332 input[3] = (in[3 * stride] + in[12 * stride]) * 4; |
| 338 input[4] = (in[4 * stride] + in[11 * stride]) << 2; | 333 input[4] = (in[4 * stride] + in[11 * stride]) * 4; |
| 339 input[5] = (in[5 * stride] + in[10 * stride]) << 2; | 334 input[5] = (in[5 * stride] + in[10 * stride]) * 4; |
| 340 input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; | 335 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; |
| 341 input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; | 336 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; |
| 342 // Calculate input for the next 8 results. | 337 // Calculate input for the next 8 results. |
| 343 step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; | 338 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; |
| 344 step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; | 339 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; |
| 345 step1[2] = (in[5 * stride] - in[10 * stride]) << 2; | 340 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; |
| 346 step1[3] = (in[4 * stride] - in[11 * stride]) << 2; | 341 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; |
| 347 step1[4] = (in[3 * stride] - in[12 * stride]) << 2; | 342 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; |
| 348 step1[5] = (in[2 * stride] - in[13 * stride]) << 2; | 343 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; |
| 349 step1[6] = (in[1 * stride] - in[14 * stride]) << 2; | 344 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; |
| 350 step1[7] = (in[0 * stride] - in[15 * stride]) << 2; | 345 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; |
| 351 } else { | 346 } else { |
| 352 // Calculate input for the first 8 results. | 347 // Calculate input for the first 8 results. |
| 353 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); | 348 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); |
| 354 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); | 349 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); |
| 355 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); | 350 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); |
| 356 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); | 351 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); |
| 357 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); | 352 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); |
| 358 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); | 353 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); |
| 359 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); | 354 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); |
| 360 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); | 355 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); |
| 361 // Calculate input for the next 8 results. | 356 // Calculate input for the next 8 results. |
| 362 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); | 357 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); |
| 363 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); | 358 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); |
| 364 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); | 359 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); |
| 365 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); | 360 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); |
| 366 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); | 361 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); |
| 367 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); | 362 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); |
| 368 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); | 363 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); |
| 369 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); | 364 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); |
| 370 } | 365 } |
| 371 // Work on the first eight values; fdct8_1d(input, even_results); | 366 // Work on the first eight values; fdct8(input, even_results); |
| 372 { | 367 { |
| 373 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 368 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
| 374 /*needs32*/ int t0, t1, t2, t3; | 369 /*needs32*/ int t0, t1, t2, t3; |
| 375 /*canbe16*/ int x0, x1, x2, x3; | 370 /*canbe16*/ int x0, x1, x2, x3; |
| 376 | 371 |
| 377 // stage 1 | 372 // stage 1 |
| 378 s0 = input[0] + input[7]; | 373 s0 = input[0] + input[7]; |
| 379 s1 = input[1] + input[6]; | 374 s1 = input[1] + input[6]; |
| 380 s2 = input[2] + input[5]; | 375 s2 = input[2] + input[5]; |
| 381 s3 = input[3] + input[4]; | 376 s3 = input[3] + input[4]; |
| 382 s4 = input[3] - input[4]; | 377 s4 = input[3] - input[4]; |
| 383 s5 = input[2] - input[5]; | 378 s5 = input[2] - input[5]; |
| 384 s6 = input[1] - input[6]; | 379 s6 = input[1] - input[6]; |
| 385 s7 = input[0] - input[7]; | 380 s7 = input[0] - input[7]; |
| 386 | 381 |
| 387 // fdct4_1d(step, step); | 382 // fdct4(step, step); |
| 388 x0 = s0 + s3; | 383 x0 = s0 + s3; |
| 389 x1 = s1 + s2; | 384 x1 = s1 + s2; |
| 390 x2 = s1 - s2; | 385 x2 = s1 - s2; |
| 391 x3 = s0 - s3; | 386 x3 = s0 - s3; |
| 392 t0 = (x0 + x1) * cospi_16_64; | 387 t0 = (x0 + x1) * cospi_16_64; |
| 393 t1 = (x0 - x1) * cospi_16_64; | 388 t1 = (x0 - x1) * cospi_16_64; |
| 394 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; | 389 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; |
| 395 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; | 390 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; |
| 396 out[0] = dct_const_round_shift(t0); | 391 out[0] = dct_const_round_shift(t0); |
| 397 out[4] = dct_const_round_shift(t2); | 392 out[4] = dct_const_round_shift(t2); |
| (...skipping 81 matching lines...) |
| 479 // Do next column (which is a transposed row in second/horizontal pass) | 474 // Do next column (which is a transposed row in second/horizontal pass) |
| 480 in++; | 475 in++; |
| 481 out += 16; | 476 out += 16; |
| 482 } | 477 } |
| 483 // Setup in/out for next pass. | 478 // Setup in/out for next pass. |
| 484 in = intermediate; | 479 in = intermediate; |
| 485 out = output; | 480 out = output; |
| 486 } | 481 } |
| 487 } | 482 } |
| 488 | 483 |
| 489 static void fadst8_1d(int16_t *input, int16_t *output) { | 484 static void fadst8(const int16_t *input, int16_t *output) { |
| 490 int s0, s1, s2, s3, s4, s5, s6, s7; | 485 int s0, s1, s2, s3, s4, s5, s6, s7; |
| 491 | 486 |
| 492 int x0 = input[7]; | 487 int x0 = input[7]; |
| 493 int x1 = input[0]; | 488 int x1 = input[0]; |
| 494 int x2 = input[5]; | 489 int x2 = input[5]; |
| 495 int x3 = input[2]; | 490 int x3 = input[2]; |
| 496 int x4 = input[3]; | 491 int x4 = input[3]; |
| 497 int x5 = input[4]; | 492 int x5 = input[4]; |
| 498 int x6 = input[1]; | 493 int x6 = input[1]; |
| 499 int x7 = input[6]; | 494 int x7 = input[6]; |
| (...skipping 51 matching lines...) |
| 551 output[1] = - x4; | 546 output[1] = - x4; |
| 552 output[2] = x6; | 547 output[2] = x6; |
| 553 output[3] = - x2; | 548 output[3] = - x2; |
| 554 output[4] = x3; | 549 output[4] = x3; |
| 555 output[5] = - x7; | 550 output[5] = - x7; |
| 556 output[6] = x5; | 551 output[6] = x5; |
| 557 output[7] = - x1; | 552 output[7] = - x1; |
| 558 } | 553 } |
| 559 | 554 |
| 560 static const transform_2d FHT_8[] = { | 555 static const transform_2d FHT_8[] = { |
| 561 { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 | 556 { fdct8, fdct8 }, // DCT_DCT = 0 |
| 562 { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 | 557 { fadst8, fdct8 }, // ADST_DCT = 1 |
| 563 { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 | 558 { fdct8, fadst8 }, // DCT_ADST = 2 |
| 564 { fadst8_1d, fadst8_1d } // ADST_ADST = 3 | 559 { fadst8, fadst8 } // ADST_ADST = 3 |
| 565 }; | 560 }; |
| 566 | 561 |
| 567 void vp9_short_fht8x8_c(int16_t *input, int16_t *output, | 562 void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, |
| 568 int pitch, TX_TYPE tx_type) { | 563 int stride, int tx_type) { |
| 569 int16_t out[64]; | 564 int16_t out[64]; |
| 570 int16_t *outptr = &out[0]; | 565 int16_t *outptr = &out[0]; |
| 571 int i, j; | 566 int i, j; |
| 572 int16_t temp_in[8], temp_out[8]; | 567 int16_t temp_in[8], temp_out[8]; |
| 573 const transform_2d ht = FHT_8[tx_type]; | 568 const transform_2d ht = FHT_8[tx_type]; |
| 574 | 569 |
| 575 // Columns | 570 // Columns |
| 576 for (i = 0; i < 8; ++i) { | 571 for (i = 0; i < 8; ++i) { |
| 577 for (j = 0; j < 8; ++j) | 572 for (j = 0; j < 8; ++j) |
| 578 temp_in[j] = input[j * pitch + i] << 2; | 573 temp_in[j] = input[j * stride + i] * 4; |
| 579 ht.cols(temp_in, temp_out); | 574 ht.cols(temp_in, temp_out); |
| 580 for (j = 0; j < 8; ++j) | 575 for (j = 0; j < 8; ++j) |
| 581 outptr[j * 8 + i] = temp_out[j]; | 576 outptr[j * 8 + i] = temp_out[j]; |
| 582 } | 577 } |
| 583 | 578 |
| 584 // Rows | 579 // Rows |
| 585 for (i = 0; i < 8; ++i) { | 580 for (i = 0; i < 8; ++i) { |
| 586 for (j = 0; j < 8; ++j) | 581 for (j = 0; j < 8; ++j) |
| 587 temp_in[j] = out[j + i * 8]; | 582 temp_in[j] = out[j + i * 8]; |
| 588 ht.rows(temp_in, temp_out); | 583 ht.rows(temp_in, temp_out); |
| 589 for (j = 0; j < 8; ++j) | 584 for (j = 0; j < 8; ++j) |
| 590 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 585 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
| 591 } | 586 } |
| 592 } | 587 } |
| 593 | 588 |
| 594 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per | 589 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
| 595 pixel. */ | 590 pixel. */ |
| 596 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { | 591 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { |
| 597 int i; | 592 int i; |
| 598 int a1, b1, c1, d1, e1; | 593 int a1, b1, c1, d1, e1; |
| 599 short *ip = input; | 594 const int16_t *ip = input; |
| 600 short *op = output; | 595 int16_t *op = output; |
| 601 int pitch_short = pitch >> 1; | |
| 602 | 596 |
| 603 for (i = 0; i < 4; i++) { | 597 for (i = 0; i < 4; i++) { |
| 604 a1 = ip[0 * pitch_short]; | 598 a1 = ip[0 * stride]; |
| 605 b1 = ip[1 * pitch_short]; | 599 b1 = ip[1 * stride]; |
| 606 c1 = ip[2 * pitch_short]; | 600 c1 = ip[2 * stride]; |
| 607 d1 = ip[3 * pitch_short]; | 601 d1 = ip[3 * stride]; |
| 608 | 602 |
| 609 a1 += b1; | 603 a1 += b1; |
| 610 d1 = d1 - c1; | 604 d1 = d1 - c1; |
| 611 e1 = (a1 - d1) >> 1; | 605 e1 = (a1 - d1) >> 1; |
| 612 b1 = e1 - b1; | 606 b1 = e1 - b1; |
| 613 c1 = e1 - c1; | 607 c1 = e1 - c1; |
| 614 a1 -= c1; | 608 a1 -= c1; |
| 615 d1 += b1; | 609 d1 += b1; |
| 616 op[0] = a1; | 610 op[0] = a1; |
| 617 op[4] = c1; | 611 op[4] = c1; |
| (...skipping 12 matching lines...) |
| 630 c1 = ip[2]; | 624 c1 = ip[2]; |
| 631 d1 = ip[3]; | 625 d1 = ip[3]; |
| 632 | 626 |
| 633 a1 += b1; | 627 a1 += b1; |
| 634 d1 -= c1; | 628 d1 -= c1; |
| 635 e1 = (a1 - d1) >> 1; | 629 e1 = (a1 - d1) >> 1; |
| 636 b1 = e1 - b1; | 630 b1 = e1 - b1; |
| 637 c1 = e1 - c1; | 631 c1 = e1 - c1; |
| 638 a1 -= c1; | 632 a1 -= c1; |
| 639 d1 += b1; | 633 d1 += b1; |
| 640 op[0] = a1 << WHT_UPSCALE_FACTOR; | 634 op[0] = a1 * UNIT_QUANT_FACTOR; |
| 641 op[1] = c1 << WHT_UPSCALE_FACTOR; | 635 op[1] = c1 * UNIT_QUANT_FACTOR; |
| 642 op[2] = d1 << WHT_UPSCALE_FACTOR; | 636 op[2] = d1 * UNIT_QUANT_FACTOR; |
| 643 op[3] = b1 << WHT_UPSCALE_FACTOR; | 637 op[3] = b1 * UNIT_QUANT_FACTOR; |
| 644 | 638 |
| 645 ip += 4; | 639 ip += 4; |
| 646 op += 4; | 640 op += 4; |
| 647 } | 641 } |
| 648 } | 642 } |
| 649 | 643 |
| 650 void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { | |
| 651 vp9_short_walsh4x4_c(input, output, pitch); | |
| 652 vp9_short_walsh4x4_c(input + 4, output + 16, pitch); | |
| 653 } | |
| 654 | |
| 655 | |
| 656 // Rewrote to use same algorithm as others. | 644 // Rewrote to use same algorithm as others. |
| 657 static void fdct16_1d(int16_t in[16], int16_t out[16]) { | 645 static void fdct16(const int16_t in[16], int16_t out[16]) { |
| 658 /*canbe16*/ int step1[8]; | 646 /*canbe16*/ int step1[8]; |
| 659 /*canbe16*/ int step2[8]; | 647 /*canbe16*/ int step2[8]; |
| 660 /*canbe16*/ int step3[8]; | 648 /*canbe16*/ int step3[8]; |
| 661 /*canbe16*/ int input[8]; | 649 /*canbe16*/ int input[8]; |
| 662 /*needs32*/ int temp1, temp2; | 650 /*needs32*/ int temp1, temp2; |
| 663 | 651 |
| 664 // step 1 | 652 // step 1 |
| 665 input[0] = in[0] + in[15]; | 653 input[0] = in[0] + in[15]; |
| 666 input[1] = in[1] + in[14]; | 654 input[1] = in[1] + in[14]; |
| 667 input[2] = in[2] + in[13]; | 655 input[2] = in[2] + in[13]; |
| 668 input[3] = in[3] + in[12]; | 656 input[3] = in[3] + in[12]; |
| 669 input[4] = in[4] + in[11]; | 657 input[4] = in[4] + in[11]; |
| 670 input[5] = in[5] + in[10]; | 658 input[5] = in[5] + in[10]; |
| 671 input[6] = in[6] + in[ 9]; | 659 input[6] = in[6] + in[ 9]; |
| 672 input[7] = in[7] + in[ 8]; | 660 input[7] = in[7] + in[ 8]; |
| 673 | 661 |
| 674 step1[0] = in[7] - in[ 8]; | 662 step1[0] = in[7] - in[ 8]; |
| 675 step1[1] = in[6] - in[ 9]; | 663 step1[1] = in[6] - in[ 9]; |
| 676 step1[2] = in[5] - in[10]; | 664 step1[2] = in[5] - in[10]; |
| 677 step1[3] = in[4] - in[11]; | 665 step1[3] = in[4] - in[11]; |
| 678 step1[4] = in[3] - in[12]; | 666 step1[4] = in[3] - in[12]; |
| 679 step1[5] = in[2] - in[13]; | 667 step1[5] = in[2] - in[13]; |
| 680 step1[6] = in[1] - in[14]; | 668 step1[6] = in[1] - in[14]; |
| 681 step1[7] = in[0] - in[15]; | 669 step1[7] = in[0] - in[15]; |
| 682 | 670 |
| 683 // fdct8_1d(step, step); | 671 // fdct8(step, step); |
| 684 { | 672 { |
| 685 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 673 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
| 686 /*needs32*/ int t0, t1, t2, t3; | 674 /*needs32*/ int t0, t1, t2, t3; |
| 687 /*canbe16*/ int x0, x1, x2, x3; | 675 /*canbe16*/ int x0, x1, x2, x3; |
| 688 | 676 |
| 689 // stage 1 | 677 // stage 1 |
| 690 s0 = input[0] + input[7]; | 678 s0 = input[0] + input[7]; |
| 691 s1 = input[1] + input[6]; | 679 s1 = input[1] + input[6]; |
| 692 s2 = input[2] + input[5]; | 680 s2 = input[2] + input[5]; |
| 693 s3 = input[3] + input[4]; | 681 s3 = input[3] + input[4]; |
| 694 s4 = input[3] - input[4]; | 682 s4 = input[3] - input[4]; |
| 695 s5 = input[2] - input[5]; | 683 s5 = input[2] - input[5]; |
| 696 s6 = input[1] - input[6]; | 684 s6 = input[1] - input[6]; |
| 697 s7 = input[0] - input[7]; | 685 s7 = input[0] - input[7]; |
| 698 | 686 |
| 699 // fdct4_1d(step, step); | 687 // fdct4(step, step); |
| 700 x0 = s0 + s3; | 688 x0 = s0 + s3; |
| 701 x1 = s1 + s2; | 689 x1 = s1 + s2; |
| 702 x2 = s1 - s2; | 690 x2 = s1 - s2; |
| 703 x3 = s0 - s3; | 691 x3 = s0 - s3; |
| 704 t0 = (x0 + x1) * cospi_16_64; | 692 t0 = (x0 + x1) * cospi_16_64; |
| 705 t1 = (x0 - x1) * cospi_16_64; | 693 t1 = (x0 - x1) * cospi_16_64; |
| 706 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; | 694 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; |
| 707 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; | 695 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; |
| 708 out[0] = dct_const_round_shift(t0); | 696 out[0] = dct_const_round_shift(t0); |
| 709 out[4] = dct_const_round_shift(t2); | 697 out[4] = dct_const_round_shift(t2); |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 788 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 776 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
| 789 out[3] = dct_const_round_shift(temp1); | 777 out[3] = dct_const_round_shift(temp1); |
| 790 out[11] = dct_const_round_shift(temp2); | 778 out[11] = dct_const_round_shift(temp2); |
| 791 | 779 |
| 792 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 780 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
| 793 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; | 781 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
| 794 out[7] = dct_const_round_shift(temp1); | 782 out[7] = dct_const_round_shift(temp1); |
| 795 out[15] = dct_const_round_shift(temp2); | 783 out[15] = dct_const_round_shift(temp2); |
| 796 } | 784 } |
| 797 | 785 |
| 798 void fadst16_1d(int16_t *input, int16_t *output) { | 786 static void fadst16(const int16_t *input, int16_t *output) { |
| 799 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 787 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
| 800 | 788 |
| 801 int x0 = input[15]; | 789 int x0 = input[15]; |
| 802 int x1 = input[0]; | 790 int x1 = input[0]; |
| 803 int x2 = input[13]; | 791 int x2 = input[13]; |
| 804 int x3 = input[2]; | 792 int x3 = input[2]; |
| 805 int x4 = input[11]; | 793 int x4 = input[11]; |
| 806 int x5 = input[4]; | 794 int x5 = input[4]; |
| 807 int x6 = input[9]; | 795 int x6 = input[9]; |
| 808 int x7 = input[6]; | 796 int x7 = input[6]; |
| (...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 951 output[9] = x11; | 939 output[9] = x11; |
| 952 output[10] = x15; | 940 output[10] = x15; |
| 953 output[11] = x7; | 941 output[11] = x7; |
| 954 output[12] = x5; | 942 output[12] = x5; |
| 955 output[13] = - x13; | 943 output[13] = - x13; |
| 956 output[14] = x9; | 944 output[14] = x9; |
| 957 output[15] = - x1; | 945 output[15] = - x1; |
| 958 } | 946 } |
| 959 | 947 |
| 960 static const transform_2d FHT_16[] = { | 948 static const transform_2d FHT_16[] = { |
| 961 { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 | 949 { fdct16, fdct16 }, // DCT_DCT = 0 |
| 962 { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 | 950 { fadst16, fdct16 }, // ADST_DCT = 1 |
| 963 { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 | 951 { fdct16, fadst16 }, // DCT_ADST = 2 |
| 964 { fadst16_1d, fadst16_1d } // ADST_ADST = 3 | 952 { fadst16, fadst16 } // ADST_ADST = 3 |
| 965 }; | 953 }; |
| 966 | 954 |
| 967 void vp9_short_fht16x16_c(int16_t *input, int16_t *output, | 955 void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, |
| 968 int pitch, TX_TYPE tx_type) { | 956 int stride, int tx_type) { |
| 969 int16_t out[256]; | 957 int16_t out[256]; |
| 970 int16_t *outptr = &out[0]; | 958 int16_t *outptr = &out[0]; |
| 971 int i, j; | 959 int i, j; |
| 972 int16_t temp_in[16], temp_out[16]; | 960 int16_t temp_in[16], temp_out[16]; |
| 973 const transform_2d ht = FHT_16[tx_type]; | 961 const transform_2d ht = FHT_16[tx_type]; |
| 974 | 962 |
| 975 // Columns | 963 // Columns |
| 976 for (i = 0; i < 16; ++i) { | 964 for (i = 0; i < 16; ++i) { |
| 977 for (j = 0; j < 16; ++j) | 965 for (j = 0; j < 16; ++j) |
| 978 temp_in[j] = input[j * pitch + i] << 2; | 966 temp_in[j] = input[j * stride + i] * 4; |
| 979 ht.cols(temp_in, temp_out); | 967 ht.cols(temp_in, temp_out); |
| 980 for (j = 0; j < 16; ++j) | 968 for (j = 0; j < 16; ++j) |
| 981 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 969 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
| 982 // outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 970 // outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 983 } | 971 } |
| 984 | 972 |
| 985 // Rows | 973 // Rows |
| 986 for (i = 0; i < 16; ++i) { | 974 for (i = 0; i < 16; ++i) { |
| 987 for (j = 0; j < 16; ++j) | 975 for (j = 0; j < 16; ++j) |
| 988 temp_in[j] = out[j + i * 16]; | 976 temp_in[j] = out[j + i * 16]; |
| 989 ht.rows(temp_in, temp_out); | 977 ht.rows(temp_in, temp_out); |
| 990 for (j = 0; j < 16; ++j) | 978 for (j = 0; j < 16; ++j) |
| 991 output[j + i * 16] = temp_out[j]; | 979 output[j + i * 16] = temp_out[j]; |
| 992 } | 980 } |
| 993 } | 981 } |
| 994 | 982 |
| 995 static INLINE int dct_32_round(int input) { | 983 static INLINE int dct_32_round(int input) { |
| 996 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 984 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
| 997 assert(-131072 <= rv && rv <= 131071); | 985 assert(-131072 <= rv && rv <= 131071); |
| 998 return rv; | 986 return rv; |
| 999 } | 987 } |
| 1000 | 988 |
| 1001 static INLINE int half_round_shift(int input) { | 989 static INLINE int half_round_shift(int input) { |
| 1002 int rv = (input + 1 + (input < 0)) >> 2; | 990 int rv = (input + 1 + (input < 0)) >> 2; |
| 1003 return rv; | 991 return rv; |
| 1004 } | 992 } |
| 1005 | 993 |
| 1006 static void dct32_1d(int *input, int *output, int round) { | 994 static void dct32_1d(const int *input, int *output, int round) { |
| 1007 int step[32]; | 995 int step[32]; |
| 1008 // Stage 1 | 996 // Stage 1 |
| 1009 step[0] = input[0] + input[(32 - 1)]; | 997 step[0] = input[0] + input[(32 - 1)]; |
| 1010 step[1] = input[1] + input[(32 - 2)]; | 998 step[1] = input[1] + input[(32 - 2)]; |
| 1011 step[2] = input[2] + input[(32 - 3)]; | 999 step[2] = input[2] + input[(32 - 3)]; |
| 1012 step[3] = input[3] + input[(32 - 4)]; | 1000 step[3] = input[3] + input[(32 - 4)]; |
| 1013 step[4] = input[4] + input[(32 - 5)]; | 1001 step[4] = input[4] + input[(32 - 5)]; |
| 1014 step[5] = input[5] + input[(32 - 6)]; | 1002 step[5] = input[5] + input[(32 - 6)]; |
| 1015 step[6] = input[6] + input[(32 - 7)]; | 1003 step[6] = input[6] + input[(32 - 7)]; |
| 1016 step[7] = input[7] + input[(32 - 8)]; | 1004 step[7] = input[7] + input[(32 - 8)]; |
| (...skipping 302 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1319 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); | 1307 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); |
| 1320 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); | 1308 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); |
| 1321 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); | 1309 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); |
| 1322 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); | 1310 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); |
| 1323 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); | 1311 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); |
| 1324 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); | 1312 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); |
| 1325 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); | 1313 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); |
| 1326 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); | 1314 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); |
| 1327 } | 1315 } |
| 1328 | 1316 |
| 1329 void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { | 1317 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { |
| 1330 int shortpitch = pitch >> 1; | |
| 1331 int i, j; | 1318 int i, j; |
| 1332 int output[32 * 32]; | 1319 int output[32 * 32]; |
| 1333 | 1320 |
| 1334 // Columns | 1321 // Columns |
| 1335 for (i = 0; i < 32; ++i) { | 1322 for (i = 0; i < 32; ++i) { |
| 1336 int temp_in[32], temp_out[32]; | 1323 int temp_in[32], temp_out[32]; |
| 1337 for (j = 0; j < 32; ++j) | 1324 for (j = 0; j < 32; ++j) |
| 1338 temp_in[j] = input[j * shortpitch + i] << 2; | 1325 temp_in[j] = input[j * stride + i] * 4; |
| 1339 dct32_1d(temp_in, temp_out, 0); | 1326 dct32_1d(temp_in, temp_out, 0); |
| 1340 for (j = 0; j < 32; ++j) | 1327 for (j = 0; j < 32; ++j) |
| 1341 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1328 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 1342 } | 1329 } |
| 1343 | 1330 |
| 1344 // Rows | 1331 // Rows |
| 1345 for (i = 0; i < 32; ++i) { | 1332 for (i = 0; i < 32; ++i) { |
| 1346 int temp_in[32], temp_out[32]; | 1333 int temp_in[32], temp_out[32]; |
| 1347 for (j = 0; j < 32; ++j) | 1334 for (j = 0; j < 32; ++j) |
| 1348 temp_in[j] = output[j + i * 32]; | 1335 temp_in[j] = output[j + i * 32]; |
| 1349 dct32_1d(temp_in, temp_out, 0); | 1336 dct32_1d(temp_in, temp_out, 0); |
| 1350 for (j = 0; j < 32; ++j) | 1337 for (j = 0; j < 32; ++j) |
| 1351 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1338 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
| 1352 } | 1339 } |
| 1353 } | 1340 } |
| 1354 | 1341 |
| 1355 // Note that although we use dct_32_round in dct32_1d computation flow, | 1342 // Note that although we use dct_32_round in dct32_1d computation flow, |
| 1356 // this 2d fdct32x32 for rate-distortion optimization loop is operating | 1343 // this 2d fdct32x32 for rate-distortion optimization loop is operating |
| 1357 // within 16 bits precision. | 1344 // within 16 bits precision. |
| 1358 void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { | 1345 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { |
| 1359 int shortpitch = pitch >> 1; | |
| 1360 int i, j; | 1346 int i, j; |
| 1361 int output[32 * 32]; | 1347 int output[32 * 32]; |
| 1362 | 1348 |
| 1363 // Columns | 1349 // Columns |
| 1364 for (i = 0; i < 32; ++i) { | 1350 for (i = 0; i < 32; ++i) { |
| 1365 int temp_in[32], temp_out[32]; | 1351 int temp_in[32], temp_out[32]; |
| 1366 for (j = 0; j < 32; ++j) | 1352 for (j = 0; j < 32; ++j) |
| 1367 temp_in[j] = input[j * shortpitch + i] << 2; | 1353 temp_in[j] = input[j * stride + i] * 4; |
| 1368 dct32_1d(temp_in, temp_out, 0); | 1354 dct32_1d(temp_in, temp_out, 0); |
| 1369 for (j = 0; j < 32; ++j) | 1355 for (j = 0; j < 32; ++j) |
| 1370 // TODO(cd): see quality impact of only doing | 1356 // TODO(cd): see quality impact of only doing |
| 1371 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; | 1357 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; |
| 1372 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c | 1358 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c |
| 1373 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1359 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 1374 } | 1360 } |
| 1375 | 1361 |
| 1376 // Rows | 1362 // Rows |
| 1377 for (i = 0; i < 32; ++i) { | 1363 for (i = 0; i < 32; ++i) { |
| 1378 int temp_in[32], temp_out[32]; | 1364 int temp_in[32], temp_out[32]; |
| 1379 for (j = 0; j < 32; ++j) | 1365 for (j = 0; j < 32; ++j) |
| 1380 temp_in[j] = output[j + i * 32]; | 1366 temp_in[j] = output[j + i * 32]; |
| 1381 dct32_1d(temp_in, temp_out, 1); | 1367 dct32_1d(temp_in, temp_out, 1); |
| 1382 for (j = 0; j < 32; ++j) | 1368 for (j = 0; j < 32; ++j) |
| 1383 out[j + i * 32] = temp_out[j]; | 1369 out[j + i * 32] = temp_out[j]; |
| 1384 } | 1370 } |
| 1385 } | 1371 } |
| 1372 |
| 1373 void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
| 1374 int stride) { |
| 1375 if (tx_type == DCT_DCT) |
| 1376 vp9_fdct4x4(input, output, stride); |
| 1377 else |
| 1378 vp9_short_fht4x4(input, output, stride, tx_type); |
| 1379 } |
| 1380 |
| 1381 void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
| 1382 int stride) { |
| 1383 if (tx_type == DCT_DCT) |
| 1384 vp9_fdct8x8(input, output, stride); |
| 1385 else |
| 1386 vp9_short_fht8x8(input, output, stride, tx_type); |
| 1387 } |
| 1388 |
| 1389 void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
| 1390 int stride) { |
| 1391 if (tx_type == DCT_DCT) |
| 1392 vp9_fdct16x16(input, output, stride); |
| 1393 else |
| 1394 vp9_short_fht16x16(input, output, stride, tx_type); |
| 1395 } |
| OLD | NEW |