OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <math.h> | 12 #include <math.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 | 16 |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
19 #include "vp9/common/vp9_systemdependent.h" | 19 #include "vp9/common/vp9_systemdependent.h" |
20 | 20 |
21 static INLINE int fdct_round_shift(int input) { | 21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) { |
22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 22 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
23 assert(INT16_MIN <= rv && rv <= INT16_MAX); | 23 // TODO(debargha, peter.derivaz): Find new bounds for this assert |
| 24 // and make the bounds consts. |
| 25 // assert(INT16_MIN <= rv && rv <= INT16_MAX); |
24 return rv; | 26 return rv; |
25 } | 27 } |
26 | 28 |
27 static void fdct4(const int16_t *input, int16_t *output) { | 29 static void fdct4(const tran_low_t *input, tran_low_t *output) { |
28 int16_t step[4]; | 30 tran_high_t step[4]; |
29 int temp1, temp2; | 31 tran_high_t temp1, temp2; |
30 | 32 |
31 step[0] = input[0] + input[3]; | 33 step[0] = input[0] + input[3]; |
32 step[1] = input[1] + input[2]; | 34 step[1] = input[1] + input[2]; |
33 step[2] = input[1] - input[2]; | 35 step[2] = input[1] - input[2]; |
34 step[3] = input[0] - input[3]; | 36 step[3] = input[0] - input[3]; |
35 | 37 |
36 temp1 = (step[0] + step[1]) * cospi_16_64; | 38 temp1 = (step[0] + step[1]) * cospi_16_64; |
37 temp2 = (step[0] - step[1]) * cospi_16_64; | 39 temp2 = (step[0] - step[1]) * cospi_16_64; |
38 output[0] = fdct_round_shift(temp1); | 40 output[0] = fdct_round_shift(temp1); |
39 output[2] = fdct_round_shift(temp2); | 41 output[2] = fdct_round_shift(temp2); |
40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 42 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 43 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
42 output[1] = fdct_round_shift(temp1); | 44 output[1] = fdct_round_shift(temp1); |
43 output[3] = fdct_round_shift(temp2); | 45 output[3] = fdct_round_shift(temp2); |
44 } | 46 } |
45 | 47 |
46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { | 48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { |
47 int r, c; | 49 int r, c; |
48 int16_t sum = 0; | 50 tran_low_t sum = 0; |
49 for (r = 0; r < 4; ++r) | 51 for (r = 0; r < 4; ++r) |
50 for (c = 0; c < 4; ++c) | 52 for (c = 0; c < 4; ++c) |
51 sum += input[r * stride + c]; | 53 sum += input[r * stride + c]; |
52 | 54 |
53 output[0] = sum << 1; | 55 output[0] = sum << 1; |
54 output[1] = 0; | 56 output[1] = 0; |
55 } | 57 } |
56 | 58 |
57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { | 59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
58 // The 2D transform is done with two passes which are actually pretty | 60 // The 2D transform is done with two passes which are actually pretty |
59 // similar. In the first one, we transform the columns and transpose | 61 // similar. In the first one, we transform the columns and transpose |
60 // the results. In the second one, we transform the rows. To achieve that, | 62 // the results. In the second one, we transform the rows. To achieve that, |
61 // as the first pass results are transposed, we transpose the columns (that | 63 // as the first pass results are transposed, we transpose the columns (that |
62 // is the transposed rows) and transpose the results (so that it goes back | 64 // is the transposed rows) and transpose the results (so that it goes back |
63 // in normal/row positions). | 65 // in normal/row positions). |
64 int pass; | 66 int pass; |
65 // We need an intermediate buffer between passes. | 67 // We need an intermediate buffer between passes. |
66 int16_t intermediate[4 * 4]; | 68 tran_low_t intermediate[4 * 4]; |
67 const int16_t *in = input; | 69 const int16_t *in_pass0 = input; |
68 int16_t *out = intermediate; | 70 const tran_low_t *in = NULL; |
| 71 tran_low_t *out = intermediate; |
69 // Do the two transform/transpose passes | 72 // Do the two transform/transpose passes |
70 for (pass = 0; pass < 2; ++pass) { | 73 for (pass = 0; pass < 2; ++pass) { |
71 /*canbe16*/ int input[4]; | 74 tran_high_t input[4]; // canbe16 |
72 /*canbe16*/ int step[4]; | 75 tran_high_t step[4]; // canbe16 |
73 /*needs32*/ int temp1, temp2; | 76 tran_high_t temp1, temp2; // needs32 |
74 int i; | 77 int i; |
75 for (i = 0; i < 4; ++i) { | 78 for (i = 0; i < 4; ++i) { |
76 // Load inputs. | 79 // Load inputs. |
77 if (0 == pass) { | 80 if (0 == pass) { |
78 input[0] = in[0 * stride] * 16; | 81 input[0] = in_pass0[0 * stride] * 16; |
79 input[1] = in[1 * stride] * 16; | 82 input[1] = in_pass0[1 * stride] * 16; |
80 input[2] = in[2 * stride] * 16; | 83 input[2] = in_pass0[2 * stride] * 16; |
81 input[3] = in[3 * stride] * 16; | 84 input[3] = in_pass0[3 * stride] * 16; |
82 if (i == 0 && input[0]) { | 85 if (i == 0 && input[0]) { |
83 input[0] += 1; | 86 input[0] += 1; |
84 } | 87 } |
85 } else { | 88 } else { |
86 input[0] = in[0 * 4]; | 89 input[0] = in[0 * 4]; |
87 input[1] = in[1 * 4]; | 90 input[1] = in[1 * 4]; |
88 input[2] = in[2 * 4]; | 91 input[2] = in[2 * 4]; |
89 input[3] = in[3 * 4]; | 92 input[3] = in[3 * 4]; |
90 } | 93 } |
91 // Transform. | 94 // Transform. |
92 step[0] = input[0] + input[3]; | 95 step[0] = input[0] + input[3]; |
93 step[1] = input[1] + input[2]; | 96 step[1] = input[1] + input[2]; |
94 step[2] = input[1] - input[2]; | 97 step[2] = input[1] - input[2]; |
95 step[3] = input[0] - input[3]; | 98 step[3] = input[0] - input[3]; |
96 temp1 = (step[0] + step[1]) * cospi_16_64; | 99 temp1 = (step[0] + step[1]) * cospi_16_64; |
97 temp2 = (step[0] - step[1]) * cospi_16_64; | 100 temp2 = (step[0] - step[1]) * cospi_16_64; |
98 out[0] = fdct_round_shift(temp1); | 101 out[0] = fdct_round_shift(temp1); |
99 out[2] = fdct_round_shift(temp2); | 102 out[2] = fdct_round_shift(temp2); |
100 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 103 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
101 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 104 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
102 out[1] = fdct_round_shift(temp1); | 105 out[1] = fdct_round_shift(temp1); |
103 out[3] = fdct_round_shift(temp2); | 106 out[3] = fdct_round_shift(temp2); |
104 // Do next column (which is a transposed row in second/horizontal pass) | 107 // Do next column (which is a transposed row in second/horizontal pass) |
| 108 in_pass0++; |
105 in++; | 109 in++; |
106 out += 4; | 110 out += 4; |
107 } | 111 } |
108 // Setup in/out for next pass. | 112 // Setup in/out for next pass. |
109 in = intermediate; | 113 in = intermediate; |
110 out = output; | 114 out = output; |
111 } | 115 } |
112 | 116 |
113 { | 117 { |
114 int i, j; | 118 int i, j; |
115 for (i = 0; i < 4; ++i) { | 119 for (i = 0; i < 4; ++i) { |
116 for (j = 0; j < 4; ++j) | 120 for (j = 0; j < 4; ++j) |
117 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; | 121 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; |
118 } | 122 } |
119 } | 123 } |
120 } | 124 } |
121 | 125 |
122 static void fadst4(const int16_t *input, int16_t *output) { | 126 static void fadst4(const tran_low_t *input, tran_low_t *output) { |
123 int x0, x1, x2, x3; | 127 tran_high_t x0, x1, x2, x3; |
124 int s0, s1, s2, s3, s4, s5, s6, s7; | 128 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
125 | 129 |
126 x0 = input[0]; | 130 x0 = input[0]; |
127 x1 = input[1]; | 131 x1 = input[1]; |
128 x2 = input[2]; | 132 x2 = input[2]; |
129 x3 = input[3]; | 133 x3 = input[3]; |
130 | 134 |
131 if (!(x0 | x1 | x2 | x3)) { | 135 if (!(x0 | x1 | x2 | x3)) { |
132 output[0] = output[1] = output[2] = output[3] = 0; | 136 output[0] = output[1] = output[2] = output[3] = 0; |
133 return; | 137 return; |
134 } | 138 } |
(...skipping 24 matching lines...) Expand all Loading... |
159 output[3] = fdct_round_shift(s3); | 163 output[3] = fdct_round_shift(s3); |
160 } | 164 } |
161 | 165 |
162 static const transform_2d FHT_4[] = { | 166 static const transform_2d FHT_4[] = { |
163 { fdct4, fdct4 }, // DCT_DCT = 0 | 167 { fdct4, fdct4 }, // DCT_DCT = 0 |
164 { fadst4, fdct4 }, // ADST_DCT = 1 | 168 { fadst4, fdct4 }, // ADST_DCT = 1 |
165 { fdct4, fadst4 }, // DCT_ADST = 2 | 169 { fdct4, fadst4 }, // DCT_ADST = 2 |
166 { fadst4, fadst4 } // ADST_ADST = 3 | 170 { fadst4, fadst4 } // ADST_ADST = 3 |
167 }; | 171 }; |
168 | 172 |
169 void vp9_fht4x4_c(const int16_t *input, int16_t *output, | 173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, |
170 int stride, int tx_type) { | 174 int stride, int tx_type) { |
171 if (tx_type == DCT_DCT) { | 175 if (tx_type == DCT_DCT) { |
172 vp9_fdct4x4_c(input, output, stride); | 176 vp9_fdct4x4_c(input, output, stride); |
173 } else { | 177 } else { |
174 int16_t out[4 * 4]; | 178 tran_low_t out[4 * 4]; |
175 int16_t *outptr = &out[0]; | 179 tran_low_t *outptr = &out[0]; |
176 int i, j; | 180 int i, j; |
177 int16_t temp_in[4], temp_out[4]; | 181 tran_low_t temp_in[4], temp_out[4]; |
178 const transform_2d ht = FHT_4[tx_type]; | 182 const transform_2d ht = FHT_4[tx_type]; |
179 | 183 |
180 // Columns | 184 // Columns |
181 for (i = 0; i < 4; ++i) { | 185 for (i = 0; i < 4; ++i) { |
182 for (j = 0; j < 4; ++j) | 186 for (j = 0; j < 4; ++j) |
183 temp_in[j] = input[j * stride + i] * 16; | 187 temp_in[j] = input[j * stride + i] * 16; |
184 if (i == 0 && temp_in[0]) | 188 if (i == 0 && temp_in[0]) |
185 temp_in[0] += 1; | 189 temp_in[0] += 1; |
186 ht.cols(temp_in, temp_out); | 190 ht.cols(temp_in, temp_out); |
187 for (j = 0; j < 4; ++j) | 191 for (j = 0; j < 4; ++j) |
188 outptr[j * 4 + i] = temp_out[j]; | 192 outptr[j * 4 + i] = temp_out[j]; |
189 } | 193 } |
190 | 194 |
191 // Rows | 195 // Rows |
192 for (i = 0; i < 4; ++i) { | 196 for (i = 0; i < 4; ++i) { |
193 for (j = 0; j < 4; ++j) | 197 for (j = 0; j < 4; ++j) |
194 temp_in[j] = out[j + i * 4]; | 198 temp_in[j] = out[j + i * 4]; |
195 ht.rows(temp_in, temp_out); | 199 ht.rows(temp_in, temp_out); |
196 for (j = 0; j < 4; ++j) | 200 for (j = 0; j < 4; ++j) |
197 output[j + i * 4] = (temp_out[j] + 1) >> 2; | 201 output[j + i * 4] = (temp_out[j] + 1) >> 2; |
198 } | 202 } |
199 } | 203 } |
200 } | 204 } |
201 | 205 |
202 static void fdct8(const int16_t *input, int16_t *output) { | 206 static void fdct8(const tran_low_t *input, tran_low_t *output) { |
203 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 207 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
204 /*needs32*/ int t0, t1, t2, t3; | 208 tran_high_t t0, t1, t2, t3; // needs32 |
205 /*canbe16*/ int x0, x1, x2, x3; | 209 tran_high_t x0, x1, x2, x3; // canbe16 |
206 | 210 |
207 // stage 1 | 211 // stage 1 |
208 s0 = input[0] + input[7]; | 212 s0 = input[0] + input[7]; |
209 s1 = input[1] + input[6]; | 213 s1 = input[1] + input[6]; |
210 s2 = input[2] + input[5]; | 214 s2 = input[2] + input[5]; |
211 s3 = input[3] + input[4]; | 215 s3 = input[3] + input[4]; |
212 s4 = input[3] - input[4]; | 216 s4 = input[3] - input[4]; |
213 s5 = input[2] - input[5]; | 217 s5 = input[2] - input[5]; |
214 s6 = input[1] - input[6]; | 218 s6 = input[1] - input[6]; |
215 s7 = input[0] - input[7]; | 219 s7 = input[0] - input[7]; |
(...skipping 28 matching lines...) Expand all Loading... |
244 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; | 248 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
245 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; | 249 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
246 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; | 250 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
247 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; | 251 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
248 output[1] = fdct_round_shift(t0); | 252 output[1] = fdct_round_shift(t0); |
249 output[3] = fdct_round_shift(t2); | 253 output[3] = fdct_round_shift(t2); |
250 output[5] = fdct_round_shift(t1); | 254 output[5] = fdct_round_shift(t1); |
251 output[7] = fdct_round_shift(t3); | 255 output[7] = fdct_round_shift(t3); |
252 } | 256 } |
253 | 257 |
254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { | 258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { |
255 int r, c; | 259 int r, c; |
256 int16_t sum = 0; | 260 tran_low_t sum = 0; |
257 for (r = 0; r < 8; ++r) | 261 for (r = 0; r < 8; ++r) |
258 for (c = 0; c < 8; ++c) | 262 for (c = 0; c < 8; ++c) |
259 sum += input[r * stride + c]; | 263 sum += input[r * stride + c]; |
260 | 264 |
261 output[0] = sum; | 265 output[0] = sum; |
262 output[1] = 0; | 266 output[1] = 0; |
263 } | 267 } |
264 | 268 |
265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { | 269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { |
266 int i, j; | 270 int i, j; |
267 int16_t intermediate[64]; | 271 tran_low_t intermediate[64]; |
268 | 272 |
269 // Transform columns | 273 // Transform columns |
270 { | 274 { |
271 int16_t *output = intermediate; | 275 tran_low_t *output = intermediate; |
272 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
273 /*needs32*/ int t0, t1, t2, t3; | 277 tran_high_t t0, t1, t2, t3; // needs32 |
274 /*canbe16*/ int x0, x1, x2, x3; | 278 tran_high_t x0, x1, x2, x3; // canbe16 |
275 | 279 |
276 int i; | 280 int i; |
277 for (i = 0; i < 8; i++) { | 281 for (i = 0; i < 8; i++) { |
278 // stage 1 | 282 // stage 1 |
279 s0 = (input[0 * stride] + input[7 * stride]) * 4; | 283 s0 = (input[0 * stride] + input[7 * stride]) * 4; |
280 s1 = (input[1 * stride] + input[6 * stride]) * 4; | 284 s1 = (input[1 * stride] + input[6 * stride]) * 4; |
281 s2 = (input[2 * stride] + input[5 * stride]) * 4; | 285 s2 = (input[2 * stride] + input[5 * stride]) * 4; |
282 s3 = (input[3 * stride] + input[4 * stride]) * 4; | 286 s3 = (input[3 * stride] + input[4 * stride]) * 4; |
283 s4 = (input[3 * stride] - input[4 * stride]) * 4; | 287 s4 = (input[3 * stride] - input[4 * stride]) * 4; |
284 s5 = (input[2 * stride] - input[5 * stride]) * 4; | 288 s5 = (input[2 * stride] - input[5 * stride]) * 4; |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
326 } | 330 } |
327 | 331 |
328 // Rows | 332 // Rows |
329 for (i = 0; i < 8; ++i) { | 333 for (i = 0; i < 8; ++i) { |
330 fdct8(&intermediate[i * 8], &final_output[i * 8]); | 334 fdct8(&intermediate[i * 8], &final_output[i * 8]); |
331 for (j = 0; j < 8; ++j) | 335 for (j = 0; j < 8; ++j) |
332 final_output[j + i * 8] /= 2; | 336 final_output[j + i * 8] /= 2; |
333 } | 337 } |
334 } | 338 } |
335 | 339 |
336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { | 340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { |
337 int r, c; | 341 int r, c; |
338 int16_t sum = 0; | 342 tran_low_t sum = 0; |
339 for (r = 0; r < 16; ++r) | 343 for (r = 0; r < 16; ++r) |
340 for (c = 0; c < 16; ++c) | 344 for (c = 0; c < 16; ++c) |
341 sum += input[r * stride + c]; | 345 sum += input[r * stride + c]; |
342 | 346 |
343 output[0] = sum >> 1; | 347 output[0] = sum >> 1; |
344 output[1] = 0; | 348 output[1] = 0; |
345 } | 349 } |
346 | 350 |
347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { | 351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { |
348 // The 2D transform is done with two passes which are actually pretty | 352 // The 2D transform is done with two passes which are actually pretty |
349 // similar. In the first one, we transform the columns and transpose | 353 // similar. In the first one, we transform the columns and transpose |
350 // the results. In the second one, we transform the rows. To achieve that, | 354 // the results. In the second one, we transform the rows. To achieve that, |
351 // as the first pass results are transposed, we transpose the columns (that | 355 // as the first pass results are transposed, we transpose the columns (that |
352 // is the transposed rows) and transpose the results (so that it goes back | 356 // is the transposed rows) and transpose the results (so that it goes back |
353 // in normal/row positions). | 357 // in normal/row positions). |
354 int pass; | 358 int pass; |
355 // We need an intermediate buffer between passes. | 359 // We need an intermediate buffer between passes. |
356 int16_t intermediate[256]; | 360 tran_low_t intermediate[256]; |
357 const int16_t *in = input; | 361 const int16_t *in_pass0 = input; |
358 int16_t *out = intermediate; | 362 const tran_low_t *in = NULL; |
| 363 tran_low_t *out = intermediate; |
359 // Do the two transform/transpose passes | 364 // Do the two transform/transpose passes |
360 for (pass = 0; pass < 2; ++pass) { | 365 for (pass = 0; pass < 2; ++pass) { |
361 /*canbe16*/ int step1[8]; | 366 tran_high_t step1[8]; // canbe16 |
362 /*canbe16*/ int step2[8]; | 367 tran_high_t step2[8]; // canbe16 |
363 /*canbe16*/ int step3[8]; | 368 tran_high_t step3[8]; // canbe16 |
364 /*canbe16*/ int input[8]; | 369 tran_high_t input[8]; // canbe16 |
365 /*needs32*/ int temp1, temp2; | 370 tran_high_t temp1, temp2; // needs32 |
366 int i; | 371 int i; |
367 for (i = 0; i < 16; i++) { | 372 for (i = 0; i < 16; i++) { |
368 if (0 == pass) { | 373 if (0 == pass) { |
369 // Calculate input for the first 8 results. | 374 // Calculate input for the first 8 results. |
370 input[0] = (in[0 * stride] + in[15 * stride]) * 4; | 375 input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; |
371 input[1] = (in[1 * stride] + in[14 * stride]) * 4; | 376 input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; |
372 input[2] = (in[2 * stride] + in[13 * stride]) * 4; | 377 input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; |
373 input[3] = (in[3 * stride] + in[12 * stride]) * 4; | 378 input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; |
374 input[4] = (in[4 * stride] + in[11 * stride]) * 4; | 379 input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; |
375 input[5] = (in[5 * stride] + in[10 * stride]) * 4; | 380 input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; |
376 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; | 381 input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; |
377 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; | 382 input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; |
378 // Calculate input for the next 8 results. | 383 // Calculate input for the next 8 results. |
379 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; | 384 step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; |
380 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; | 385 step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; |
381 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; | 386 step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; |
382 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; | 387 step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; |
383 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; | 388 step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; |
384 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; | 389 step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; |
385 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; | 390 step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; |
386 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; | 391 step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; |
387 } else { | 392 } else { |
388 // Calculate input for the first 8 results. | 393 // Calculate input for the first 8 results. |
389 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); | 394 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); |
390 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); | 395 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); |
391 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); | 396 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); |
392 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); | 397 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); |
393 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); | 398 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); |
394 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); | 399 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); |
395 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); | 400 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); |
396 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); | 401 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); |
397 // Calculate input for the next 8 results. | 402 // Calculate input for the next 8 results. |
398 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); | 403 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); |
399 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); | 404 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); |
400 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); | 405 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); |
401 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); | 406 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); |
402 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); | 407 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); |
403 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); | 408 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); |
404 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); | 409 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); |
405 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); | 410 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); |
406 } | 411 } |
407 // Work on the first eight values; fdct8(input, even_results); | 412 // Work on the first eight values; fdct8(input, even_results); |
408 { | 413 { |
409 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 414 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
410 /*needs32*/ int t0, t1, t2, t3; | 415 tran_high_t t0, t1, t2, t3; // needs32 |
411 /*canbe16*/ int x0, x1, x2, x3; | 416 tran_high_t x0, x1, x2, x3; // canbe16 |
412 | 417 |
413 // stage 1 | 418 // stage 1 |
414 s0 = input[0] + input[7]; | 419 s0 = input[0] + input[7]; |
415 s1 = input[1] + input[6]; | 420 s1 = input[1] + input[6]; |
416 s2 = input[2] + input[5]; | 421 s2 = input[2] + input[5]; |
417 s3 = input[3] + input[4]; | 422 s3 = input[3] + input[4]; |
418 s4 = input[3] - input[4]; | 423 s4 = input[3] - input[4]; |
419 s5 = input[2] - input[5]; | 424 s5 = input[2] - input[5]; |
420 s6 = input[1] - input[6]; | 425 s6 = input[1] - input[6]; |
421 s7 = input[0] - input[7]; | 426 s7 = input[0] - input[7]; |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
507 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 512 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
508 out[3] = fdct_round_shift(temp1); | 513 out[3] = fdct_round_shift(temp1); |
509 out[11] = fdct_round_shift(temp2); | 514 out[11] = fdct_round_shift(temp2); |
510 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 515 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
511 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; | 516 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
512 out[7] = fdct_round_shift(temp1); | 517 out[7] = fdct_round_shift(temp1); |
513 out[15] = fdct_round_shift(temp2); | 518 out[15] = fdct_round_shift(temp2); |
514 } | 519 } |
515 // Do next column (which is a transposed row in second/horizontal pass) | 520 // Do next column (which is a transposed row in second/horizontal pass) |
516 in++; | 521 in++; |
| 522 in_pass0++; |
517 out += 16; | 523 out += 16; |
518 } | 524 } |
519 // Setup in/out for next pass. | 525 // Setup in/out for next pass. |
520 in = intermediate; | 526 in = intermediate; |
521 out = output; | 527 out = output; |
522 } | 528 } |
523 } | 529 } |
524 | 530 |
525 static void fadst8(const int16_t *input, int16_t *output) { | 531 static void fadst8(const tran_low_t *input, tran_low_t *output) { |
526 int s0, s1, s2, s3, s4, s5, s6, s7; | 532 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
527 | 533 |
528 int x0 = input[7]; | 534 tran_high_t x0 = input[7]; |
529 int x1 = input[0]; | 535 tran_high_t x1 = input[0]; |
530 int x2 = input[5]; | 536 tran_high_t x2 = input[5]; |
531 int x3 = input[2]; | 537 tran_high_t x3 = input[2]; |
532 int x4 = input[3]; | 538 tran_high_t x4 = input[3]; |
533 int x5 = input[4]; | 539 tran_high_t x5 = input[4]; |
534 int x6 = input[1]; | 540 tran_high_t x6 = input[1]; |
535 int x7 = input[6]; | 541 tran_high_t x7 = input[6]; |
536 | 542 |
537 // stage 1 | 543 // stage 1 |
538 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | 544 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; |
539 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | 545 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; |
540 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | 546 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; |
541 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | 547 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; |
542 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | 548 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; |
543 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | 549 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; |
544 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; | 550 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; |
545 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; | 551 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
593 output[7] = - x1; | 599 output[7] = - x1; |
594 } | 600 } |
595 | 601 |
596 static const transform_2d FHT_8[] = { | 602 static const transform_2d FHT_8[] = { |
597 { fdct8, fdct8 }, // DCT_DCT = 0 | 603 { fdct8, fdct8 }, // DCT_DCT = 0 |
598 { fadst8, fdct8 }, // ADST_DCT = 1 | 604 { fadst8, fdct8 }, // ADST_DCT = 1 |
599 { fdct8, fadst8 }, // DCT_ADST = 2 | 605 { fdct8, fadst8 }, // DCT_ADST = 2 |
600 { fadst8, fadst8 } // ADST_ADST = 3 | 606 { fadst8, fadst8 } // ADST_ADST = 3 |
601 }; | 607 }; |
602 | 608 |
603 void vp9_fht8x8_c(const int16_t *input, int16_t *output, | 609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, |
604 int stride, int tx_type) { | 610 int stride, int tx_type) { |
605 if (tx_type == DCT_DCT) { | 611 if (tx_type == DCT_DCT) { |
606 vp9_fdct8x8_c(input, output, stride); | 612 vp9_fdct8x8_c(input, output, stride); |
607 } else { | 613 } else { |
608 int16_t out[64]; | 614 tran_low_t out[64]; |
609 int16_t *outptr = &out[0]; | 615 tran_low_t *outptr = &out[0]; |
610 int i, j; | 616 int i, j; |
611 int16_t temp_in[8], temp_out[8]; | 617 tran_low_t temp_in[8], temp_out[8]; |
612 const transform_2d ht = FHT_8[tx_type]; | 618 const transform_2d ht = FHT_8[tx_type]; |
613 | 619 |
614 // Columns | 620 // Columns |
615 for (i = 0; i < 8; ++i) { | 621 for (i = 0; i < 8; ++i) { |
616 for (j = 0; j < 8; ++j) | 622 for (j = 0; j < 8; ++j) |
617 temp_in[j] = input[j * stride + i] * 4; | 623 temp_in[j] = input[j * stride + i] * 4; |
618 ht.cols(temp_in, temp_out); | 624 ht.cols(temp_in, temp_out); |
619 for (j = 0; j < 8; ++j) | 625 for (j = 0; j < 8; ++j) |
620 outptr[j * 8 + i] = temp_out[j]; | 626 outptr[j * 8 + i] = temp_out[j]; |
621 } | 627 } |
622 | 628 |
623 // Rows | 629 // Rows |
624 for (i = 0; i < 8; ++i) { | 630 for (i = 0; i < 8; ++i) { |
625 for (j = 0; j < 8; ++j) | 631 for (j = 0; j < 8; ++j) |
626 temp_in[j] = out[j + i * 8]; | 632 temp_in[j] = out[j + i * 8]; |
627 ht.rows(temp_in, temp_out); | 633 ht.rows(temp_in, temp_out); |
628 for (j = 0; j < 8; ++j) | 634 for (j = 0; j < 8; ++j) |
629 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 635 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
630 } | 636 } |
631 } | 637 } |
632 } | 638 } |
633 | 639 |
634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per | 640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
635 pixel. */ | 641 pixel. */ |
636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { | 642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
637 int i; | 643 int i; |
638 int a1, b1, c1, d1, e1; | 644 tran_high_t a1, b1, c1, d1, e1; |
639 const int16_t *ip = input; | 645 const int16_t *ip_pass0 = input; |
640 int16_t *op = output; | 646 const tran_low_t *ip = NULL; |
| 647 tran_low_t *op = output; |
641 | 648 |
642 for (i = 0; i < 4; i++) { | 649 for (i = 0; i < 4; i++) { |
643 a1 = ip[0 * stride]; | 650 a1 = ip_pass0[0 * stride]; |
644 b1 = ip[1 * stride]; | 651 b1 = ip_pass0[1 * stride]; |
645 c1 = ip[2 * stride]; | 652 c1 = ip_pass0[2 * stride]; |
646 d1 = ip[3 * stride]; | 653 d1 = ip_pass0[3 * stride]; |
647 | 654 |
648 a1 += b1; | 655 a1 += b1; |
649 d1 = d1 - c1; | 656 d1 = d1 - c1; |
650 e1 = (a1 - d1) >> 1; | 657 e1 = (a1 - d1) >> 1; |
651 b1 = e1 - b1; | 658 b1 = e1 - b1; |
652 c1 = e1 - c1; | 659 c1 = e1 - c1; |
653 a1 -= c1; | 660 a1 -= c1; |
654 d1 += b1; | 661 d1 += b1; |
655 op[0] = a1; | 662 op[0] = a1; |
656 op[4] = c1; | 663 op[4] = c1; |
657 op[8] = d1; | 664 op[8] = d1; |
658 op[12] = b1; | 665 op[12] = b1; |
659 | 666 |
660 ip++; | 667 ip_pass0++; |
661 op++; | 668 op++; |
662 } | 669 } |
663 ip = output; | 670 ip = output; |
664 op = output; | 671 op = output; |
665 | 672 |
666 for (i = 0; i < 4; i++) { | 673 for (i = 0; i < 4; i++) { |
667 a1 = ip[0]; | 674 a1 = ip[0]; |
668 b1 = ip[1]; | 675 b1 = ip[1]; |
669 c1 = ip[2]; | 676 c1 = ip[2]; |
670 d1 = ip[3]; | 677 d1 = ip[3]; |
671 | 678 |
672 a1 += b1; | 679 a1 += b1; |
673 d1 -= c1; | 680 d1 -= c1; |
674 e1 = (a1 - d1) >> 1; | 681 e1 = (a1 - d1) >> 1; |
675 b1 = e1 - b1; | 682 b1 = e1 - b1; |
676 c1 = e1 - c1; | 683 c1 = e1 - c1; |
677 a1 -= c1; | 684 a1 -= c1; |
678 d1 += b1; | 685 d1 += b1; |
679 op[0] = a1 * UNIT_QUANT_FACTOR; | 686 op[0] = a1 * UNIT_QUANT_FACTOR; |
680 op[1] = c1 * UNIT_QUANT_FACTOR; | 687 op[1] = c1 * UNIT_QUANT_FACTOR; |
681 op[2] = d1 * UNIT_QUANT_FACTOR; | 688 op[2] = d1 * UNIT_QUANT_FACTOR; |
682 op[3] = b1 * UNIT_QUANT_FACTOR; | 689 op[3] = b1 * UNIT_QUANT_FACTOR; |
683 | 690 |
684 ip += 4; | 691 ip += 4; |
685 op += 4; | 692 op += 4; |
686 } | 693 } |
687 } | 694 } |
688 | 695 |
689 // Rewrote to use same algorithm as others. | 696 // Rewrote to use same algorithm as others. |
690 static void fdct16(const int16_t in[16], int16_t out[16]) { | 697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { |
691 /*canbe16*/ int step1[8]; | 698 tran_high_t step1[8]; // canbe16 |
692 /*canbe16*/ int step2[8]; | 699 tran_high_t step2[8]; // canbe16 |
693 /*canbe16*/ int step3[8]; | 700 tran_high_t step3[8]; // canbe16 |
694 /*canbe16*/ int input[8]; | 701 tran_high_t input[8]; // canbe16 |
695 /*needs32*/ int temp1, temp2; | 702 tran_high_t temp1, temp2; // needs32 |
696 | 703 |
697 // step 1 | 704 // step 1 |
698 input[0] = in[0] + in[15]; | 705 input[0] = in[0] + in[15]; |
699 input[1] = in[1] + in[14]; | 706 input[1] = in[1] + in[14]; |
700 input[2] = in[2] + in[13]; | 707 input[2] = in[2] + in[13]; |
701 input[3] = in[3] + in[12]; | 708 input[3] = in[3] + in[12]; |
702 input[4] = in[4] + in[11]; | 709 input[4] = in[4] + in[11]; |
703 input[5] = in[5] + in[10]; | 710 input[5] = in[5] + in[10]; |
704 input[6] = in[6] + in[ 9]; | 711 input[6] = in[6] + in[ 9]; |
705 input[7] = in[7] + in[ 8]; | 712 input[7] = in[7] + in[ 8]; |
706 | 713 |
707 step1[0] = in[7] - in[ 8]; | 714 step1[0] = in[7] - in[ 8]; |
708 step1[1] = in[6] - in[ 9]; | 715 step1[1] = in[6] - in[ 9]; |
709 step1[2] = in[5] - in[10]; | 716 step1[2] = in[5] - in[10]; |
710 step1[3] = in[4] - in[11]; | 717 step1[3] = in[4] - in[11]; |
711 step1[4] = in[3] - in[12]; | 718 step1[4] = in[3] - in[12]; |
712 step1[5] = in[2] - in[13]; | 719 step1[5] = in[2] - in[13]; |
713 step1[6] = in[1] - in[14]; | 720 step1[6] = in[1] - in[14]; |
714 step1[7] = in[0] - in[15]; | 721 step1[7] = in[0] - in[15]; |
715 | 722 |
716 // fdct8(step, step); | 723 // fdct8(step, step); |
717 { | 724 { |
718 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 725 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
719 /*needs32*/ int t0, t1, t2, t3; | 726 tran_high_t t0, t1, t2, t3; // needs32 |
720 /*canbe16*/ int x0, x1, x2, x3; | 727 tran_high_t x0, x1, x2, x3; // canbe16 |
721 | 728 |
722 // stage 1 | 729 // stage 1 |
723 s0 = input[0] + input[7]; | 730 s0 = input[0] + input[7]; |
724 s1 = input[1] + input[6]; | 731 s1 = input[1] + input[6]; |
725 s2 = input[2] + input[5]; | 732 s2 = input[2] + input[5]; |
726 s3 = input[3] + input[4]; | 733 s3 = input[3] + input[4]; |
727 s4 = input[3] - input[4]; | 734 s4 = input[3] - input[4]; |
728 s5 = input[2] - input[5]; | 735 s5 = input[2] - input[5]; |
729 s6 = input[1] - input[6]; | 736 s6 = input[1] - input[6]; |
730 s7 = input[0] - input[7]; | 737 s7 = input[0] - input[7]; |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
821 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 828 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
822 out[3] = fdct_round_shift(temp1); | 829 out[3] = fdct_round_shift(temp1); |
823 out[11] = fdct_round_shift(temp2); | 830 out[11] = fdct_round_shift(temp2); |
824 | 831 |
825 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 832 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
826 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; | 833 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
827 out[7] = fdct_round_shift(temp1); | 834 out[7] = fdct_round_shift(temp1); |
828 out[15] = fdct_round_shift(temp2); | 835 out[15] = fdct_round_shift(temp2); |
829 } | 836 } |
830 | 837 |
831 static void fadst16(const int16_t *input, int16_t *output) { | 838 static void fadst16(const tran_low_t *input, tran_low_t *output) { |
832 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 839 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; |
| 840 tran_high_t s9, s10, s11, s12, s13, s14, s15; |
833 | 841 |
834 int x0 = input[15]; | 842 tran_high_t x0 = input[15]; |
835 int x1 = input[0]; | 843 tran_high_t x1 = input[0]; |
836 int x2 = input[13]; | 844 tran_high_t x2 = input[13]; |
837 int x3 = input[2]; | 845 tran_high_t x3 = input[2]; |
838 int x4 = input[11]; | 846 tran_high_t x4 = input[11]; |
839 int x5 = input[4]; | 847 tran_high_t x5 = input[4]; |
840 int x6 = input[9]; | 848 tran_high_t x6 = input[9]; |
841 int x7 = input[6]; | 849 tran_high_t x7 = input[6]; |
842 int x8 = input[7]; | 850 tran_high_t x8 = input[7]; |
843 int x9 = input[8]; | 851 tran_high_t x9 = input[8]; |
844 int x10 = input[5]; | 852 tran_high_t x10 = input[5]; |
845 int x11 = input[10]; | 853 tran_high_t x11 = input[10]; |
846 int x12 = input[3]; | 854 tran_high_t x12 = input[3]; |
847 int x13 = input[12]; | 855 tran_high_t x13 = input[12]; |
848 int x14 = input[1]; | 856 tran_high_t x14 = input[1]; |
849 int x15 = input[14]; | 857 tran_high_t x15 = input[14]; |
850 | 858 |
851 // stage 1 | 859 // stage 1 |
852 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; | 860 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; |
853 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | 861 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; |
854 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; | 862 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; |
855 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | 863 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; |
856 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; | 864 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; |
857 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | 865 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; |
858 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | 866 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; |
859 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | 867 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; |
(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
990 output[15] = - x1; | 998 output[15] = - x1; |
991 } | 999 } |
992 | 1000 |
993 static const transform_2d FHT_16[] = { | 1001 static const transform_2d FHT_16[] = { |
994 { fdct16, fdct16 }, // DCT_DCT = 0 | 1002 { fdct16, fdct16 }, // DCT_DCT = 0 |
995 { fadst16, fdct16 }, // ADST_DCT = 1 | 1003 { fadst16, fdct16 }, // ADST_DCT = 1 |
996 { fdct16, fadst16 }, // DCT_ADST = 2 | 1004 { fdct16, fadst16 }, // DCT_ADST = 2 |
997 { fadst16, fadst16 } // ADST_ADST = 3 | 1005 { fadst16, fadst16 } // ADST_ADST = 3 |
998 }; | 1006 }; |
999 | 1007 |
1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output, | 1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, |
1001 int stride, int tx_type) { | 1009 int stride, int tx_type) { |
1002 if (tx_type == DCT_DCT) { | 1010 if (tx_type == DCT_DCT) { |
1003 vp9_fdct16x16_c(input, output, stride); | 1011 vp9_fdct16x16_c(input, output, stride); |
1004 } else { | 1012 } else { |
1005 int16_t out[256]; | 1013 tran_low_t out[256]; |
1006 int16_t *outptr = &out[0]; | 1014 tran_low_t *outptr = &out[0]; |
1007 int i, j; | 1015 int i, j; |
1008 int16_t temp_in[16], temp_out[16]; | 1016 tran_low_t temp_in[16], temp_out[16]; |
1009 const transform_2d ht = FHT_16[tx_type]; | 1017 const transform_2d ht = FHT_16[tx_type]; |
1010 | 1018 |
1011 // Columns | 1019 // Columns |
1012 for (i = 0; i < 16; ++i) { | 1020 for (i = 0; i < 16; ++i) { |
1013 for (j = 0; j < 16; ++j) | 1021 for (j = 0; j < 16; ++j) |
1014 temp_in[j] = input[j * stride + i] * 4; | 1022 temp_in[j] = input[j * stride + i] * 4; |
1015 ht.cols(temp_in, temp_out); | 1023 ht.cols(temp_in, temp_out); |
1016 for (j = 0; j < 16; ++j) | 1024 for (j = 0; j < 16; ++j) |
1017 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1025 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
1018 } | 1026 } |
1019 | 1027 |
1020 // Rows | 1028 // Rows |
1021 for (i = 0; i < 16; ++i) { | 1029 for (i = 0; i < 16; ++i) { |
1022 for (j = 0; j < 16; ++j) | 1030 for (j = 0; j < 16; ++j) |
1023 temp_in[j] = out[j + i * 16]; | 1031 temp_in[j] = out[j + i * 16]; |
1024 ht.rows(temp_in, temp_out); | 1032 ht.rows(temp_in, temp_out); |
1025 for (j = 0; j < 16; ++j) | 1033 for (j = 0; j < 16; ++j) |
1026 output[j + i * 16] = temp_out[j]; | 1034 output[j + i * 16] = temp_out[j]; |
1027 } | 1035 } |
1028 } | 1036 } |
1029 } | 1037 } |
1030 | 1038 |
1031 static INLINE int dct_32_round(int input) { | 1039 static INLINE tran_high_t dct_32_round(tran_high_t input) { |
1032 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 1040 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
1033 assert(-131072 <= rv && rv <= 131071); | 1041 // TODO(debargha, peter.derivaz): Find new bounds for this assert, |
| 1042 // and make the bounds consts. |
| 1043 // assert(-131072 <= rv && rv <= 131071); |
1034 return rv; | 1044 return rv; |
1035 } | 1045 } |
1036 | 1046 |
1037 static INLINE int half_round_shift(int input) { | 1047 static INLINE tran_high_t half_round_shift(tran_high_t input) { |
1038 int rv = (input + 1 + (input < 0)) >> 2; | 1048 tran_high_t rv = (input + 1 + (input < 0)) >> 2; |
1039 return rv; | 1049 return rv; |
1040 } | 1050 } |
1041 | 1051 |
1042 static void fdct32(const int *input, int *output, int round) { | 1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) { |
1043 int step[32]; | 1053 tran_high_t step[32]; |
1044 // Stage 1 | 1054 // Stage 1 |
1045 step[0] = input[0] + input[(32 - 1)]; | 1055 step[0] = input[0] + input[(32 - 1)]; |
1046 step[1] = input[1] + input[(32 - 2)]; | 1056 step[1] = input[1] + input[(32 - 2)]; |
1047 step[2] = input[2] + input[(32 - 3)]; | 1057 step[2] = input[2] + input[(32 - 3)]; |
1048 step[3] = input[3] + input[(32 - 4)]; | 1058 step[3] = input[3] + input[(32 - 4)]; |
1049 step[4] = input[4] + input[(32 - 5)]; | 1059 step[4] = input[4] + input[(32 - 5)]; |
1050 step[5] = input[5] + input[(32 - 6)]; | 1060 step[5] = input[5] + input[(32 - 6)]; |
1051 step[6] = input[6] + input[(32 - 7)]; | 1061 step[6] = input[6] + input[(32 - 7)]; |
1052 step[7] = input[7] + input[(32 - 8)]; | 1062 step[7] = input[7] + input[(32 - 8)]; |
1053 step[8] = input[8] + input[(32 - 9)]; | 1063 step[8] = input[8] + input[(32 - 9)]; |
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1355 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); | 1365 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); |
1356 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); | 1366 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); |
1357 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); | 1367 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); |
1358 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); | 1368 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); |
1359 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); | 1369 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); |
1360 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); | 1370 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); |
1361 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); | 1371 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); |
1362 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); | 1372 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); |
1363 } | 1373 } |
1364 | 1374 |
1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { | 1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { |
1366 int r, c; | 1376 int r, c; |
1367 int16_t sum = 0; | 1377 tran_low_t sum = 0; |
1368 for (r = 0; r < 32; ++r) | 1378 for (r = 0; r < 32; ++r) |
1369 for (c = 0; c < 32; ++c) | 1379 for (c = 0; c < 32; ++c) |
1370 sum += input[r * stride + c]; | 1380 sum += input[r * stride + c]; |
1371 | 1381 |
1372 output[0] = sum >> 3; | 1382 output[0] = sum >> 3; |
1373 output[1] = 0; | 1383 output[1] = 0; |
1374 } | 1384 } |
1375 | 1385 |
1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { | 1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { |
1377 int i, j; | 1387 int i, j; |
1378 int output[32 * 32]; | 1388 tran_high_t output[32 * 32]; |
1379 | 1389 |
1380 // Columns | 1390 // Columns |
1381 for (i = 0; i < 32; ++i) { | 1391 for (i = 0; i < 32; ++i) { |
1382 int temp_in[32], temp_out[32]; | 1392 tran_high_t temp_in[32], temp_out[32]; |
1383 for (j = 0; j < 32; ++j) | 1393 for (j = 0; j < 32; ++j) |
1384 temp_in[j] = input[j * stride + i] * 4; | 1394 temp_in[j] = input[j * stride + i] * 4; |
1385 fdct32(temp_in, temp_out, 0); | 1395 fdct32(temp_in, temp_out, 0); |
1386 for (j = 0; j < 32; ++j) | 1396 for (j = 0; j < 32; ++j) |
1387 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1397 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
1388 } | 1398 } |
1389 | 1399 |
1390 // Rows | 1400 // Rows |
1391 for (i = 0; i < 32; ++i) { | 1401 for (i = 0; i < 32; ++i) { |
1392 int temp_in[32], temp_out[32]; | 1402 tran_high_t temp_in[32], temp_out[32]; |
1393 for (j = 0; j < 32; ++j) | 1403 for (j = 0; j < 32; ++j) |
1394 temp_in[j] = output[j + i * 32]; | 1404 temp_in[j] = output[j + i * 32]; |
1395 fdct32(temp_in, temp_out, 0); | 1405 fdct32(temp_in, temp_out, 0); |
1396 for (j = 0; j < 32; ++j) | 1406 for (j = 0; j < 32; ++j) |
1397 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1407 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
1398 } | 1408 } |
1399 } | 1409 } |
1400 | 1410 |
1401 // Note that although we use dct_32_round in dct32 computation flow, | 1411 // Note that although we use dct_32_round in dct32 computation flow, |
1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating | 1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating |
1403 // within 16 bits precision. | 1413 // within 16 bits precision. |
1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { | 1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { |
1405 int i, j; | 1415 int i, j; |
1406 int output[32 * 32]; | 1416 tran_high_t output[32 * 32]; |
1407 | 1417 |
1408 // Columns | 1418 // Columns |
1409 for (i = 0; i < 32; ++i) { | 1419 for (i = 0; i < 32; ++i) { |
1410 int temp_in[32], temp_out[32]; | 1420 tran_high_t temp_in[32], temp_out[32]; |
1411 for (j = 0; j < 32; ++j) | 1421 for (j = 0; j < 32; ++j) |
1412 temp_in[j] = input[j * stride + i] * 4; | 1422 temp_in[j] = input[j * stride + i] * 4; |
1413 fdct32(temp_in, temp_out, 0); | 1423 fdct32(temp_in, temp_out, 0); |
1414 for (j = 0; j < 32; ++j) | 1424 for (j = 0; j < 32; ++j) |
1415 // TODO(cd): see quality impact of only doing | 1425 // TODO(cd): see quality impact of only doing |
1416 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; | 1426 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; |
1417 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c | 1427 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c |
1418 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1428 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
1419 } | 1429 } |
1420 | 1430 |
1421 // Rows | 1431 // Rows |
1422 for (i = 0; i < 32; ++i) { | 1432 for (i = 0; i < 32; ++i) { |
1423 int temp_in[32], temp_out[32]; | 1433 tran_high_t temp_in[32], temp_out[32]; |
1424 for (j = 0; j < 32; ++j) | 1434 for (j = 0; j < 32; ++j) |
1425 temp_in[j] = output[j + i * 32]; | 1435 temp_in[j] = output[j + i * 32]; |
1426 fdct32(temp_in, temp_out, 1); | 1436 fdct32(temp_in, temp_out, 1); |
1427 for (j = 0; j < 32; ++j) | 1437 for (j = 0; j < 32; ++j) |
1428 out[j + i * 32] = temp_out[j]; | 1438 out[j + i * 32] = temp_out[j]; |
1429 } | 1439 } |
1430 } | 1440 } |
| 1441 |
| 1442 #if CONFIG_VP9_HIGHBITDEPTH |
| 1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
| 1444 vp9_fdct4x4_c(input, output, stride); |
| 1445 } |
| 1446 |
| 1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output, |
| 1448 int stride, int tx_type) { |
| 1449 vp9_fht4x4_c(input, output, stride, tx_type); |
| 1450 } |
| 1451 |
| 1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, |
| 1453 int stride) { |
| 1454 vp9_fdct8x8_1_c(input, final_output, stride); |
| 1455 } |
| 1456 |
| 1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output, |
| 1458 int stride) { |
| 1459 vp9_fdct8x8_c(input, final_output, stride); |
| 1460 } |
| 1461 |
| 1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output, |
| 1463 int stride) { |
| 1464 vp9_fdct16x16_1_c(input, output, stride); |
| 1465 } |
| 1466 |
| 1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output, |
| 1468 int stride) { |
| 1469 vp9_fdct16x16_c(input, output, stride); |
| 1470 } |
| 1471 |
| 1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output, |
| 1473 int stride, int tx_type) { |
| 1474 vp9_fht8x8_c(input, output, stride, tx_type); |
| 1475 } |
| 1476 |
| 1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
| 1478 vp9_fwht4x4_c(input, output, stride); |
| 1479 } |
| 1480 |
| 1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output, |
| 1482 int stride, int tx_type) { |
| 1483 vp9_fht16x16_c(input, output, stride, tx_type); |
| 1484 } |
| 1485 |
| 1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) { |
| 1487 vp9_fdct32x32_1_c(input, out, stride); |
| 1488 } |
| 1489 |
| 1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { |
| 1491 vp9_fdct32x32_c(input, out, stride); |
| 1492 } |
| 1493 |
| 1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, |
| 1495 int stride) { |
| 1496 vp9_fdct32x32_rd_c(input, out, stride); |
| 1497 } |
| 1498 #endif // CONFIG_VP9_HIGHBITDEPTH |
OLD | NEW |