OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 | |
12 #include <assert.h> | 11 #include <assert.h> |
13 #include <math.h> | 12 #include <math.h> |
| 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "vp9/common/vp9_systemdependent.h" | 15 #include "./vp9_rtcd.h" |
16 | 16 |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
| 19 #include "vp9/common/vp9_systemdependent.h" |
19 | 20 |
20 static void fdct4_1d(int16_t *input, int16_t *output) { | 21 #include "vp9/encoder/vp9_dct.h" |
| 22 |
| 23 static void fdct4(const int16_t *input, int16_t *output) { |
21 int16_t step[4]; | 24 int16_t step[4]; |
22 int temp1, temp2; | 25 int temp1, temp2; |
23 | 26 |
24 step[0] = input[0] + input[3]; | 27 step[0] = input[0] + input[3]; |
25 step[1] = input[1] + input[2]; | 28 step[1] = input[1] + input[2]; |
26 step[2] = input[1] - input[2]; | 29 step[2] = input[1] - input[2]; |
27 step[3] = input[0] - input[3]; | 30 step[3] = input[0] - input[3]; |
28 | 31 |
29 temp1 = (step[0] + step[1]) * cospi_16_64; | 32 temp1 = (step[0] + step[1]) * cospi_16_64; |
30 temp2 = (step[0] - step[1]) * cospi_16_64; | 33 temp2 = (step[0] - step[1]) * cospi_16_64; |
31 output[0] = dct_const_round_shift(temp1); | 34 output[0] = dct_const_round_shift(temp1); |
32 output[2] = dct_const_round_shift(temp2); | 35 output[2] = dct_const_round_shift(temp2); |
33 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; | 36 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
34 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; | 37 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
35 output[1] = dct_const_round_shift(temp1); | 38 output[1] = dct_const_round_shift(temp1); |
36 output[3] = dct_const_round_shift(temp2); | 39 output[3] = dct_const_round_shift(temp2); |
37 } | 40 } |
38 | 41 |
39 void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { | 42 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { |
40 // The 2D transform is done with two passes which are actually pretty | 43 // The 2D transform is done with two passes which are actually pretty |
41 // similar. In the first one, we transform the columns and transpose | 44 // similar. In the first one, we transform the columns and transpose |
42 // the results. In the second one, we transform the rows. To achieve that, | 45 // the results. In the second one, we transform the rows. To achieve that, |
43 // as the first pass results are transposed, we transpose the columns (that | 46 // as the first pass results are transposed, we transpose the columns (that |
44 // is the transposed rows) and transpose the results (so that it goes back | 47 // is the transposed rows) and transpose the results (so that it goes back |
45 // in normal/row positions). | 48 // in normal/row positions). |
46 const int stride = pitch >> 1; | |
47 int pass; | 49 int pass; |
48 // We need an intermediate buffer between passes. | 50 // We need an intermediate buffer between passes. |
49 int16_t intermediate[4 * 4]; | 51 int16_t intermediate[4 * 4]; |
50 int16_t *in = input; | 52 const int16_t *in = input; |
51 int16_t *out = intermediate; | 53 int16_t *out = intermediate; |
52 // Do the two transform/transpose passes | 54 // Do the two transform/transpose passes |
53 for (pass = 0; pass < 2; ++pass) { | 55 for (pass = 0; pass < 2; ++pass) { |
54 /*canbe16*/ int input[4]; | 56 /*canbe16*/ int input[4]; |
55 /*canbe16*/ int step[4]; | 57 /*canbe16*/ int step[4]; |
56 /*needs32*/ int temp1, temp2; | 58 /*needs32*/ int temp1, temp2; |
57 int i; | 59 int i; |
58 for (i = 0; i < 4; ++i) { | 60 for (i = 0; i < 4; ++i) { |
59 // Load inputs. | 61 // Load inputs. |
60 if (0 == pass) { | 62 if (0 == pass) { |
61 input[0] = in[0 * stride] << 4; | 63 input[0] = in[0 * stride] * 16; |
62 input[1] = in[1 * stride] << 4; | 64 input[1] = in[1 * stride] * 16; |
63 input[2] = in[2 * stride] << 4; | 65 input[2] = in[2 * stride] * 16; |
64 input[3] = in[3 * stride] << 4; | 66 input[3] = in[3 * stride] * 16; |
65 if (i == 0 && input[0]) { | 67 if (i == 0 && input[0]) { |
66 input[0] += 1; | 68 input[0] += 1; |
67 } | 69 } |
68 } else { | 70 } else { |
69 input[0] = in[0 * 4]; | 71 input[0] = in[0 * 4]; |
70 input[1] = in[1 * 4]; | 72 input[1] = in[1 * 4]; |
71 input[2] = in[2 * 4]; | 73 input[2] = in[2 * 4]; |
72 input[3] = in[3 * 4]; | 74 input[3] = in[3 * 4]; |
73 } | 75 } |
74 // Transform. | 76 // Transform. |
(...skipping 20 matching lines...) Expand all Loading... |
95 | 97 |
96 { | 98 { |
97 int i, j; | 99 int i, j; |
98 for (i = 0; i < 4; ++i) { | 100 for (i = 0; i < 4; ++i) { |
99 for (j = 0; j < 4; ++j) | 101 for (j = 0; j < 4; ++j) |
100 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; | 102 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; |
101 } | 103 } |
102 } | 104 } |
103 } | 105 } |
104 | 106 |
105 static void fadst4_1d(int16_t *input, int16_t *output) { | 107 static void fadst4(const int16_t *input, int16_t *output) { |
106 int x0, x1, x2, x3; | 108 int x0, x1, x2, x3; |
107 int s0, s1, s2, s3, s4, s5, s6, s7; | 109 int s0, s1, s2, s3, s4, s5, s6, s7; |
108 | 110 |
109 x0 = input[0]; | 111 x0 = input[0]; |
110 x1 = input[1]; | 112 x1 = input[1]; |
111 x2 = input[2]; | 113 x2 = input[2]; |
112 x3 = input[3]; | 114 x3 = input[3]; |
113 | 115 |
114 if (!(x0 | x1 | x2 | x3)) { | 116 if (!(x0 | x1 | x2 | x3)) { |
115 output[0] = output[1] = output[2] = output[3] = 0; | 117 output[0] = output[1] = output[2] = output[3] = 0; |
(...skipping 20 matching lines...) Expand all Loading... |
136 s3 = x2 - x0 + x3; | 138 s3 = x2 - x0 + x3; |
137 | 139 |
138 // 1-D transform scaling factor is sqrt(2). | 140 // 1-D transform scaling factor is sqrt(2). |
139 output[0] = dct_const_round_shift(s0); | 141 output[0] = dct_const_round_shift(s0); |
140 output[1] = dct_const_round_shift(s1); | 142 output[1] = dct_const_round_shift(s1); |
141 output[2] = dct_const_round_shift(s2); | 143 output[2] = dct_const_round_shift(s2); |
142 output[3] = dct_const_round_shift(s3); | 144 output[3] = dct_const_round_shift(s3); |
143 } | 145 } |
144 | 146 |
145 static const transform_2d FHT_4[] = { | 147 static const transform_2d FHT_4[] = { |
146 { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 | 148 { fdct4, fdct4 }, // DCT_DCT = 0 |
147 { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 | 149 { fadst4, fdct4 }, // ADST_DCT = 1 |
148 { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 | 150 { fdct4, fadst4 }, // DCT_ADST = 2 |
149 { fadst4_1d, fadst4_1d } // ADST_ADST = 3 | 151 { fadst4, fadst4 } // ADST_ADST = 3 |
150 }; | 152 }; |
151 | 153 |
152 void vp9_short_fht4x4_c(int16_t *input, int16_t *output, | 154 void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, |
153 int pitch, TX_TYPE tx_type) { | 155 int stride, int tx_type) { |
154 int16_t out[4 * 4]; | 156 int16_t out[4 * 4]; |
155 int16_t *outptr = &out[0]; | 157 int16_t *outptr = &out[0]; |
156 int i, j; | 158 int i, j; |
157 int16_t temp_in[4], temp_out[4]; | 159 int16_t temp_in[4], temp_out[4]; |
158 const transform_2d ht = FHT_4[tx_type]; | 160 const transform_2d ht = FHT_4[tx_type]; |
159 | 161 |
160 // Columns | 162 // Columns |
161 for (i = 0; i < 4; ++i) { | 163 for (i = 0; i < 4; ++i) { |
162 for (j = 0; j < 4; ++j) | 164 for (j = 0; j < 4; ++j) |
163 temp_in[j] = input[j * pitch + i] << 4; | 165 temp_in[j] = input[j * stride + i] * 16; |
164 if (i == 0 && temp_in[0]) | 166 if (i == 0 && temp_in[0]) |
165 temp_in[0] += 1; | 167 temp_in[0] += 1; |
166 ht.cols(temp_in, temp_out); | 168 ht.cols(temp_in, temp_out); |
167 for (j = 0; j < 4; ++j) | 169 for (j = 0; j < 4; ++j) |
168 outptr[j * 4 + i] = temp_out[j]; | 170 outptr[j * 4 + i] = temp_out[j]; |
169 } | 171 } |
170 | 172 |
171 // Rows | 173 // Rows |
172 for (i = 0; i < 4; ++i) { | 174 for (i = 0; i < 4; ++i) { |
173 for (j = 0; j < 4; ++j) | 175 for (j = 0; j < 4; ++j) |
174 temp_in[j] = out[j + i * 4]; | 176 temp_in[j] = out[j + i * 4]; |
175 ht.rows(temp_in, temp_out); | 177 ht.rows(temp_in, temp_out); |
176 for (j = 0; j < 4; ++j) | 178 for (j = 0; j < 4; ++j) |
177 output[j + i * 4] = (temp_out[j] + 1) >> 2; | 179 output[j + i * 4] = (temp_out[j] + 1) >> 2; |
178 } | 180 } |
179 } | 181 } |
180 | 182 |
181 void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { | 183 static void fdct8(const int16_t *input, int16_t *output) { |
182 vp9_short_fdct4x4_c(input, output, pitch); | |
183 vp9_short_fdct4x4_c(input + 4, output + 16, pitch); | |
184 } | |
185 | |
186 static void fdct8_1d(int16_t *input, int16_t *output) { | |
187 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 184 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
188 /*needs32*/ int t0, t1, t2, t3; | 185 /*needs32*/ int t0, t1, t2, t3; |
189 /*canbe16*/ int x0, x1, x2, x3; | 186 /*canbe16*/ int x0, x1, x2, x3; |
190 | 187 |
191 // stage 1 | 188 // stage 1 |
192 s0 = input[0] + input[7]; | 189 s0 = input[0] + input[7]; |
193 s1 = input[1] + input[6]; | 190 s1 = input[1] + input[6]; |
194 s2 = input[2] + input[5]; | 191 s2 = input[2] + input[5]; |
195 s3 = input[3] + input[4]; | 192 s3 = input[3] + input[4]; |
196 s4 = input[3] - input[4]; | 193 s4 = input[3] - input[4]; |
197 s5 = input[2] - input[5]; | 194 s5 = input[2] - input[5]; |
198 s6 = input[1] - input[6]; | 195 s6 = input[1] - input[6]; |
199 s7 = input[0] - input[7]; | 196 s7 = input[0] - input[7]; |
200 | 197 |
201 // fdct4_1d(step, step); | 198 // fdct4(step, step); |
202 x0 = s0 + s3; | 199 x0 = s0 + s3; |
203 x1 = s1 + s2; | 200 x1 = s1 + s2; |
204 x2 = s1 - s2; | 201 x2 = s1 - s2; |
205 x3 = s0 - s3; | 202 x3 = s0 - s3; |
206 t0 = (x0 + x1) * cospi_16_64; | 203 t0 = (x0 + x1) * cospi_16_64; |
207 t1 = (x0 - x1) * cospi_16_64; | 204 t1 = (x0 - x1) * cospi_16_64; |
208 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; | 205 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; |
209 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; | 206 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; |
210 output[0] = dct_const_round_shift(t0); | 207 output[0] = dct_const_round_shift(t0); |
211 output[2] = dct_const_round_shift(t2); | 208 output[2] = dct_const_round_shift(t2); |
(...skipping 16 matching lines...) Expand all Loading... |
228 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; | 225 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
229 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; | 226 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
230 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; | 227 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
231 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; | 228 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
232 output[1] = dct_const_round_shift(t0); | 229 output[1] = dct_const_round_shift(t0); |
233 output[3] = dct_const_round_shift(t2); | 230 output[3] = dct_const_round_shift(t2); |
234 output[5] = dct_const_round_shift(t1); | 231 output[5] = dct_const_round_shift(t1); |
235 output[7] = dct_const_round_shift(t3); | 232 output[7] = dct_const_round_shift(t3); |
236 } | 233 } |
237 | 234 |
238 void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { | 235 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { |
239 const int stride = pitch >> 1; | |
240 int i, j; | 236 int i, j; |
241 int16_t intermediate[64]; | 237 int16_t intermediate[64]; |
242 | 238 |
243 // Transform columns | 239 // Transform columns |
244 { | 240 { |
245 int16_t *output = intermediate; | 241 int16_t *output = intermediate; |
246 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 242 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
247 /*needs32*/ int t0, t1, t2, t3; | 243 /*needs32*/ int t0, t1, t2, t3; |
248 /*canbe16*/ int x0, x1, x2, x3; | 244 /*canbe16*/ int x0, x1, x2, x3; |
249 | 245 |
250 int i; | 246 int i; |
251 for (i = 0; i < 8; i++) { | 247 for (i = 0; i < 8; i++) { |
252 // stage 1 | 248 // stage 1 |
253 s0 = (input[0 * stride] + input[7 * stride]) << 2; | 249 s0 = (input[0 * stride] + input[7 * stride]) * 4; |
254 s1 = (input[1 * stride] + input[6 * stride]) << 2; | 250 s1 = (input[1 * stride] + input[6 * stride]) * 4; |
255 s2 = (input[2 * stride] + input[5 * stride]) << 2; | 251 s2 = (input[2 * stride] + input[5 * stride]) * 4; |
256 s3 = (input[3 * stride] + input[4 * stride]) << 2; | 252 s3 = (input[3 * stride] + input[4 * stride]) * 4; |
257 s4 = (input[3 * stride] - input[4 * stride]) << 2; | 253 s4 = (input[3 * stride] - input[4 * stride]) * 4; |
258 s5 = (input[2 * stride] - input[5 * stride]) << 2; | 254 s5 = (input[2 * stride] - input[5 * stride]) * 4; |
259 s6 = (input[1 * stride] - input[6 * stride]) << 2; | 255 s6 = (input[1 * stride] - input[6 * stride]) * 4; |
260 s7 = (input[0 * stride] - input[7 * stride]) << 2; | 256 s7 = (input[0 * stride] - input[7 * stride]) * 4; |
261 | 257 |
262 // fdct4_1d(step, step); | 258 // fdct4(step, step); |
263 x0 = s0 + s3; | 259 x0 = s0 + s3; |
264 x1 = s1 + s2; | 260 x1 = s1 + s2; |
265 x2 = s1 - s2; | 261 x2 = s1 - s2; |
266 x3 = s0 - s3; | 262 x3 = s0 - s3; |
267 t0 = (x0 + x1) * cospi_16_64; | 263 t0 = (x0 + x1) * cospi_16_64; |
268 t1 = (x0 - x1) * cospi_16_64; | 264 t1 = (x0 - x1) * cospi_16_64; |
269 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; | 265 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; |
270 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; | 266 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; |
271 output[0 * 8] = dct_const_round_shift(t0); | 267 output[0 * 8] = dct_const_round_shift(t0); |
272 output[2 * 8] = dct_const_round_shift(t2); | 268 output[2 * 8] = dct_const_round_shift(t2); |
(...skipping 21 matching lines...) Expand all Loading... |
294 output[3 * 8] = dct_const_round_shift(t2); | 290 output[3 * 8] = dct_const_round_shift(t2); |
295 output[5 * 8] = dct_const_round_shift(t1); | 291 output[5 * 8] = dct_const_round_shift(t1); |
296 output[7 * 8] = dct_const_round_shift(t3); | 292 output[7 * 8] = dct_const_round_shift(t3); |
297 input++; | 293 input++; |
298 output++; | 294 output++; |
299 } | 295 } |
300 } | 296 } |
301 | 297 |
302 // Rows | 298 // Rows |
303 for (i = 0; i < 8; ++i) { | 299 for (i = 0; i < 8; ++i) { |
304 fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); | 300 fdct8(&intermediate[i * 8], &final_output[i * 8]); |
305 for (j = 0; j < 8; ++j) | 301 for (j = 0; j < 8; ++j) |
306 final_output[j + i * 8] /= 2; | 302 final_output[j + i * 8] /= 2; |
307 } | 303 } |
308 } | 304 } |
309 | 305 |
310 void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { | 306 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { |
311 // The 2D transform is done with two passes which are actually pretty | 307 // The 2D transform is done with two passes which are actually pretty |
312 // similar. In the first one, we transform the columns and transpose | 308 // similar. In the first one, we transform the columns and transpose |
313 // the results. In the second one, we transform the rows. To achieve that, | 309 // the results. In the second one, we transform the rows. To achieve that, |
314 // as the first pass results are transposed, we transpose the columns (that | 310 // as the first pass results are transposed, we transpose the columns (that |
315 // is the transposed rows) and transpose the results (so that it goes back | 311 // is the transposed rows) and transpose the results (so that it goes back |
316 // in normal/row positions). | 312 // in normal/row positions). |
317 const int stride = pitch >> 1; | |
318 int pass; | 313 int pass; |
319 // We need an intermediate buffer between passes. | 314 // We need an intermediate buffer between passes. |
320 int16_t intermediate[256]; | 315 int16_t intermediate[256]; |
321 int16_t *in = input; | 316 const int16_t *in = input; |
322 int16_t *out = intermediate; | 317 int16_t *out = intermediate; |
323 // Do the two transform/transpose passes | 318 // Do the two transform/transpose passes |
324 for (pass = 0; pass < 2; ++pass) { | 319 for (pass = 0; pass < 2; ++pass) { |
325 /*canbe16*/ int step1[8]; | 320 /*canbe16*/ int step1[8]; |
326 /*canbe16*/ int step2[8]; | 321 /*canbe16*/ int step2[8]; |
327 /*canbe16*/ int step3[8]; | 322 /*canbe16*/ int step3[8]; |
328 /*canbe16*/ int input[8]; | 323 /*canbe16*/ int input[8]; |
329 /*needs32*/ int temp1, temp2; | 324 /*needs32*/ int temp1, temp2; |
330 int i; | 325 int i; |
331 for (i = 0; i < 16; i++) { | 326 for (i = 0; i < 16; i++) { |
332 if (0 == pass) { | 327 if (0 == pass) { |
333 // Calculate input for the first 8 results. | 328 // Calculate input for the first 8 results. |
334 input[0] = (in[0 * stride] + in[15 * stride]) << 2; | 329 input[0] = (in[0 * stride] + in[15 * stride]) * 4; |
335 input[1] = (in[1 * stride] + in[14 * stride]) << 2; | 330 input[1] = (in[1 * stride] + in[14 * stride]) * 4; |
336 input[2] = (in[2 * stride] + in[13 * stride]) << 2; | 331 input[2] = (in[2 * stride] + in[13 * stride]) * 4; |
337 input[3] = (in[3 * stride] + in[12 * stride]) << 2; | 332 input[3] = (in[3 * stride] + in[12 * stride]) * 4; |
338 input[4] = (in[4 * stride] + in[11 * stride]) << 2; | 333 input[4] = (in[4 * stride] + in[11 * stride]) * 4; |
339 input[5] = (in[5 * stride] + in[10 * stride]) << 2; | 334 input[5] = (in[5 * stride] + in[10 * stride]) * 4; |
340 input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; | 335 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; |
341 input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; | 336 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; |
342 // Calculate input for the next 8 results. | 337 // Calculate input for the next 8 results. |
343 step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; | 338 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; |
344 step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; | 339 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; |
345 step1[2] = (in[5 * stride] - in[10 * stride]) << 2; | 340 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; |
346 step1[3] = (in[4 * stride] - in[11 * stride]) << 2; | 341 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; |
347 step1[4] = (in[3 * stride] - in[12 * stride]) << 2; | 342 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; |
348 step1[5] = (in[2 * stride] - in[13 * stride]) << 2; | 343 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; |
349 step1[6] = (in[1 * stride] - in[14 * stride]) << 2; | 344 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; |
350 step1[7] = (in[0 * stride] - in[15 * stride]) << 2; | 345 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; |
351 } else { | 346 } else { |
352 // Calculate input for the first 8 results. | 347 // Calculate input for the first 8 results. |
353 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); | 348 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); |
354 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); | 349 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); |
355 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); | 350 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); |
356 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); | 351 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); |
357 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); | 352 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); |
358 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); | 353 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); |
359 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); | 354 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); |
360 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); | 355 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); |
361 // Calculate input for the next 8 results. | 356 // Calculate input for the next 8 results. |
362 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); | 357 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); |
363 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); | 358 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); |
364 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); | 359 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); |
365 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); | 360 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); |
366 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); | 361 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); |
367 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); | 362 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); |
368 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); | 363 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); |
369 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); | 364 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); |
370 } | 365 } |
371 // Work on the first eight values; fdct8_1d(input, even_results); | 366 // Work on the first eight values; fdct8(input, even_results); |
372 { | 367 { |
373 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 368 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
374 /*needs32*/ int t0, t1, t2, t3; | 369 /*needs32*/ int t0, t1, t2, t3; |
375 /*canbe16*/ int x0, x1, x2, x3; | 370 /*canbe16*/ int x0, x1, x2, x3; |
376 | 371 |
377 // stage 1 | 372 // stage 1 |
378 s0 = input[0] + input[7]; | 373 s0 = input[0] + input[7]; |
379 s1 = input[1] + input[6]; | 374 s1 = input[1] + input[6]; |
380 s2 = input[2] + input[5]; | 375 s2 = input[2] + input[5]; |
381 s3 = input[3] + input[4]; | 376 s3 = input[3] + input[4]; |
382 s4 = input[3] - input[4]; | 377 s4 = input[3] - input[4]; |
383 s5 = input[2] - input[5]; | 378 s5 = input[2] - input[5]; |
384 s6 = input[1] - input[6]; | 379 s6 = input[1] - input[6]; |
385 s7 = input[0] - input[7]; | 380 s7 = input[0] - input[7]; |
386 | 381 |
387 // fdct4_1d(step, step); | 382 // fdct4(step, step); |
388 x0 = s0 + s3; | 383 x0 = s0 + s3; |
389 x1 = s1 + s2; | 384 x1 = s1 + s2; |
390 x2 = s1 - s2; | 385 x2 = s1 - s2; |
391 x3 = s0 - s3; | 386 x3 = s0 - s3; |
392 t0 = (x0 + x1) * cospi_16_64; | 387 t0 = (x0 + x1) * cospi_16_64; |
393 t1 = (x0 - x1) * cospi_16_64; | 388 t1 = (x0 - x1) * cospi_16_64; |
394 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; | 389 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; |
395 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; | 390 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; |
396 out[0] = dct_const_round_shift(t0); | 391 out[0] = dct_const_round_shift(t0); |
397 out[4] = dct_const_round_shift(t2); | 392 out[4] = dct_const_round_shift(t2); |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
479 // Do next column (which is a transposed row in second/horizontal pass) | 474 // Do next column (which is a transposed row in second/horizontal pass) |
480 in++; | 475 in++; |
481 out += 16; | 476 out += 16; |
482 } | 477 } |
483 // Setup in/out for next pass. | 478 // Setup in/out for next pass. |
484 in = intermediate; | 479 in = intermediate; |
485 out = output; | 480 out = output; |
486 } | 481 } |
487 } | 482 } |
488 | 483 |
489 static void fadst8_1d(int16_t *input, int16_t *output) { | 484 static void fadst8(const int16_t *input, int16_t *output) { |
490 int s0, s1, s2, s3, s4, s5, s6, s7; | 485 int s0, s1, s2, s3, s4, s5, s6, s7; |
491 | 486 |
492 int x0 = input[7]; | 487 int x0 = input[7]; |
493 int x1 = input[0]; | 488 int x1 = input[0]; |
494 int x2 = input[5]; | 489 int x2 = input[5]; |
495 int x3 = input[2]; | 490 int x3 = input[2]; |
496 int x4 = input[3]; | 491 int x4 = input[3]; |
497 int x5 = input[4]; | 492 int x5 = input[4]; |
498 int x6 = input[1]; | 493 int x6 = input[1]; |
499 int x7 = input[6]; | 494 int x7 = input[6]; |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
551 output[1] = - x4; | 546 output[1] = - x4; |
552 output[2] = x6; | 547 output[2] = x6; |
553 output[3] = - x2; | 548 output[3] = - x2; |
554 output[4] = x3; | 549 output[4] = x3; |
555 output[5] = - x7; | 550 output[5] = - x7; |
556 output[6] = x5; | 551 output[6] = x5; |
557 output[7] = - x1; | 552 output[7] = - x1; |
558 } | 553 } |
559 | 554 |
560 static const transform_2d FHT_8[] = { | 555 static const transform_2d FHT_8[] = { |
561 { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 | 556 { fdct8, fdct8 }, // DCT_DCT = 0 |
562 { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 | 557 { fadst8, fdct8 }, // ADST_DCT = 1 |
563 { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 | 558 { fdct8, fadst8 }, // DCT_ADST = 2 |
564 { fadst8_1d, fadst8_1d } // ADST_ADST = 3 | 559 { fadst8, fadst8 } // ADST_ADST = 3 |
565 }; | 560 }; |
566 | 561 |
567 void vp9_short_fht8x8_c(int16_t *input, int16_t *output, | 562 void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, |
568 int pitch, TX_TYPE tx_type) { | 563 int stride, int tx_type) { |
569 int16_t out[64]; | 564 int16_t out[64]; |
570 int16_t *outptr = &out[0]; | 565 int16_t *outptr = &out[0]; |
571 int i, j; | 566 int i, j; |
572 int16_t temp_in[8], temp_out[8]; | 567 int16_t temp_in[8], temp_out[8]; |
573 const transform_2d ht = FHT_8[tx_type]; | 568 const transform_2d ht = FHT_8[tx_type]; |
574 | 569 |
575 // Columns | 570 // Columns |
576 for (i = 0; i < 8; ++i) { | 571 for (i = 0; i < 8; ++i) { |
577 for (j = 0; j < 8; ++j) | 572 for (j = 0; j < 8; ++j) |
578 temp_in[j] = input[j * pitch + i] << 2; | 573 temp_in[j] = input[j * stride + i] * 4; |
579 ht.cols(temp_in, temp_out); | 574 ht.cols(temp_in, temp_out); |
580 for (j = 0; j < 8; ++j) | 575 for (j = 0; j < 8; ++j) |
581 outptr[j * 8 + i] = temp_out[j]; | 576 outptr[j * 8 + i] = temp_out[j]; |
582 } | 577 } |
583 | 578 |
584 // Rows | 579 // Rows |
585 for (i = 0; i < 8; ++i) { | 580 for (i = 0; i < 8; ++i) { |
586 for (j = 0; j < 8; ++j) | 581 for (j = 0; j < 8; ++j) |
587 temp_in[j] = out[j + i * 8]; | 582 temp_in[j] = out[j + i * 8]; |
588 ht.rows(temp_in, temp_out); | 583 ht.rows(temp_in, temp_out); |
589 for (j = 0; j < 8; ++j) | 584 for (j = 0; j < 8; ++j) |
590 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 585 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
591 } | 586 } |
592 } | 587 } |
593 | 588 |
594 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per | 589 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
595 pixel. */ | 590 pixel. */ |
596 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { | 591 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { |
597 int i; | 592 int i; |
598 int a1, b1, c1, d1, e1; | 593 int a1, b1, c1, d1, e1; |
599 short *ip = input; | 594 const int16_t *ip = input; |
600 short *op = output; | 595 int16_t *op = output; |
601 int pitch_short = pitch >> 1; | |
602 | 596 |
603 for (i = 0; i < 4; i++) { | 597 for (i = 0; i < 4; i++) { |
604 a1 = ip[0 * pitch_short]; | 598 a1 = ip[0 * stride]; |
605 b1 = ip[1 * pitch_short]; | 599 b1 = ip[1 * stride]; |
606 c1 = ip[2 * pitch_short]; | 600 c1 = ip[2 * stride]; |
607 d1 = ip[3 * pitch_short]; | 601 d1 = ip[3 * stride]; |
608 | 602 |
609 a1 += b1; | 603 a1 += b1; |
610 d1 = d1 - c1; | 604 d1 = d1 - c1; |
611 e1 = (a1 - d1) >> 1; | 605 e1 = (a1 - d1) >> 1; |
612 b1 = e1 - b1; | 606 b1 = e1 - b1; |
613 c1 = e1 - c1; | 607 c1 = e1 - c1; |
614 a1 -= c1; | 608 a1 -= c1; |
615 d1 += b1; | 609 d1 += b1; |
616 op[0] = a1; | 610 op[0] = a1; |
617 op[4] = c1; | 611 op[4] = c1; |
(...skipping 12 matching lines...) Expand all Loading... |
630 c1 = ip[2]; | 624 c1 = ip[2]; |
631 d1 = ip[3]; | 625 d1 = ip[3]; |
632 | 626 |
633 a1 += b1; | 627 a1 += b1; |
634 d1 -= c1; | 628 d1 -= c1; |
635 e1 = (a1 - d1) >> 1; | 629 e1 = (a1 - d1) >> 1; |
636 b1 = e1 - b1; | 630 b1 = e1 - b1; |
637 c1 = e1 - c1; | 631 c1 = e1 - c1; |
638 a1 -= c1; | 632 a1 -= c1; |
639 d1 += b1; | 633 d1 += b1; |
640 op[0] = a1 << WHT_UPSCALE_FACTOR; | 634 op[0] = a1 * UNIT_QUANT_FACTOR; |
641 op[1] = c1 << WHT_UPSCALE_FACTOR; | 635 op[1] = c1 * UNIT_QUANT_FACTOR; |
642 op[2] = d1 << WHT_UPSCALE_FACTOR; | 636 op[2] = d1 * UNIT_QUANT_FACTOR; |
643 op[3] = b1 << WHT_UPSCALE_FACTOR; | 637 op[3] = b1 * UNIT_QUANT_FACTOR; |
644 | 638 |
645 ip += 4; | 639 ip += 4; |
646 op += 4; | 640 op += 4; |
647 } | 641 } |
648 } | 642 } |
649 | 643 |
650 void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { | |
651 vp9_short_walsh4x4_c(input, output, pitch); | |
652 vp9_short_walsh4x4_c(input + 4, output + 16, pitch); | |
653 } | |
654 | |
655 | |
656 // Rewrote to use same algorithm as others. | 644 // Rewrote to use same algorithm as others. |
657 static void fdct16_1d(int16_t in[16], int16_t out[16]) { | 645 static void fdct16(const int16_t in[16], int16_t out[16]) { |
658 /*canbe16*/ int step1[8]; | 646 /*canbe16*/ int step1[8]; |
659 /*canbe16*/ int step2[8]; | 647 /*canbe16*/ int step2[8]; |
660 /*canbe16*/ int step3[8]; | 648 /*canbe16*/ int step3[8]; |
661 /*canbe16*/ int input[8]; | 649 /*canbe16*/ int input[8]; |
662 /*needs32*/ int temp1, temp2; | 650 /*needs32*/ int temp1, temp2; |
663 | 651 |
664 // step 1 | 652 // step 1 |
665 input[0] = in[0] + in[15]; | 653 input[0] = in[0] + in[15]; |
666 input[1] = in[1] + in[14]; | 654 input[1] = in[1] + in[14]; |
667 input[2] = in[2] + in[13]; | 655 input[2] = in[2] + in[13]; |
668 input[3] = in[3] + in[12]; | 656 input[3] = in[3] + in[12]; |
669 input[4] = in[4] + in[11]; | 657 input[4] = in[4] + in[11]; |
670 input[5] = in[5] + in[10]; | 658 input[5] = in[5] + in[10]; |
671 input[6] = in[6] + in[ 9]; | 659 input[6] = in[6] + in[ 9]; |
672 input[7] = in[7] + in[ 8]; | 660 input[7] = in[7] + in[ 8]; |
673 | 661 |
674 step1[0] = in[7] - in[ 8]; | 662 step1[0] = in[7] - in[ 8]; |
675 step1[1] = in[6] - in[ 9]; | 663 step1[1] = in[6] - in[ 9]; |
676 step1[2] = in[5] - in[10]; | 664 step1[2] = in[5] - in[10]; |
677 step1[3] = in[4] - in[11]; | 665 step1[3] = in[4] - in[11]; |
678 step1[4] = in[3] - in[12]; | 666 step1[4] = in[3] - in[12]; |
679 step1[5] = in[2] - in[13]; | 667 step1[5] = in[2] - in[13]; |
680 step1[6] = in[1] - in[14]; | 668 step1[6] = in[1] - in[14]; |
681 step1[7] = in[0] - in[15]; | 669 step1[7] = in[0] - in[15]; |
682 | 670 |
683 // fdct8_1d(step, step); | 671 // fdct8(step, step); |
684 { | 672 { |
685 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; | 673 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
686 /*needs32*/ int t0, t1, t2, t3; | 674 /*needs32*/ int t0, t1, t2, t3; |
687 /*canbe16*/ int x0, x1, x2, x3; | 675 /*canbe16*/ int x0, x1, x2, x3; |
688 | 676 |
689 // stage 1 | 677 // stage 1 |
690 s0 = input[0] + input[7]; | 678 s0 = input[0] + input[7]; |
691 s1 = input[1] + input[6]; | 679 s1 = input[1] + input[6]; |
692 s2 = input[2] + input[5]; | 680 s2 = input[2] + input[5]; |
693 s3 = input[3] + input[4]; | 681 s3 = input[3] + input[4]; |
694 s4 = input[3] - input[4]; | 682 s4 = input[3] - input[4]; |
695 s5 = input[2] - input[5]; | 683 s5 = input[2] - input[5]; |
696 s6 = input[1] - input[6]; | 684 s6 = input[1] - input[6]; |
697 s7 = input[0] - input[7]; | 685 s7 = input[0] - input[7]; |
698 | 686 |
699 // fdct4_1d(step, step); | 687 // fdct4(step, step); |
700 x0 = s0 + s3; | 688 x0 = s0 + s3; |
701 x1 = s1 + s2; | 689 x1 = s1 + s2; |
702 x2 = s1 - s2; | 690 x2 = s1 - s2; |
703 x3 = s0 - s3; | 691 x3 = s0 - s3; |
704 t0 = (x0 + x1) * cospi_16_64; | 692 t0 = (x0 + x1) * cospi_16_64; |
705 t1 = (x0 - x1) * cospi_16_64; | 693 t1 = (x0 - x1) * cospi_16_64; |
706 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; | 694 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; |
707 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; | 695 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; |
708 out[0] = dct_const_round_shift(t0); | 696 out[0] = dct_const_round_shift(t0); |
709 out[4] = dct_const_round_shift(t2); | 697 out[4] = dct_const_round_shift(t2); |
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
788 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; | 776 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
789 out[3] = dct_const_round_shift(temp1); | 777 out[3] = dct_const_round_shift(temp1); |
790 out[11] = dct_const_round_shift(temp2); | 778 out[11] = dct_const_round_shift(temp2); |
791 | 779 |
792 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; | 780 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
793 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; | 781 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
794 out[7] = dct_const_round_shift(temp1); | 782 out[7] = dct_const_round_shift(temp1); |
795 out[15] = dct_const_round_shift(temp2); | 783 out[15] = dct_const_round_shift(temp2); |
796 } | 784 } |
797 | 785 |
798 void fadst16_1d(int16_t *input, int16_t *output) { | 786 static void fadst16(const int16_t *input, int16_t *output) { |
799 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 787 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
800 | 788 |
801 int x0 = input[15]; | 789 int x0 = input[15]; |
802 int x1 = input[0]; | 790 int x1 = input[0]; |
803 int x2 = input[13]; | 791 int x2 = input[13]; |
804 int x3 = input[2]; | 792 int x3 = input[2]; |
805 int x4 = input[11]; | 793 int x4 = input[11]; |
806 int x5 = input[4]; | 794 int x5 = input[4]; |
807 int x6 = input[9]; | 795 int x6 = input[9]; |
808 int x7 = input[6]; | 796 int x7 = input[6]; |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
951 output[9] = x11; | 939 output[9] = x11; |
952 output[10] = x15; | 940 output[10] = x15; |
953 output[11] = x7; | 941 output[11] = x7; |
954 output[12] = x5; | 942 output[12] = x5; |
955 output[13] = - x13; | 943 output[13] = - x13; |
956 output[14] = x9; | 944 output[14] = x9; |
957 output[15] = - x1; | 945 output[15] = - x1; |
958 } | 946 } |
959 | 947 |
960 static const transform_2d FHT_16[] = { | 948 static const transform_2d FHT_16[] = { |
961 { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 | 949 { fdct16, fdct16 }, // DCT_DCT = 0 |
962 { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 | 950 { fadst16, fdct16 }, // ADST_DCT = 1 |
963 { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 | 951 { fdct16, fadst16 }, // DCT_ADST = 2 |
964 { fadst16_1d, fadst16_1d } // ADST_ADST = 3 | 952 { fadst16, fadst16 } // ADST_ADST = 3 |
965 }; | 953 }; |
966 | 954 |
967 void vp9_short_fht16x16_c(int16_t *input, int16_t *output, | 955 void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, |
968 int pitch, TX_TYPE tx_type) { | 956 int stride, int tx_type) { |
969 int16_t out[256]; | 957 int16_t out[256]; |
970 int16_t *outptr = &out[0]; | 958 int16_t *outptr = &out[0]; |
971 int i, j; | 959 int i, j; |
972 int16_t temp_in[16], temp_out[16]; | 960 int16_t temp_in[16], temp_out[16]; |
973 const transform_2d ht = FHT_16[tx_type]; | 961 const transform_2d ht = FHT_16[tx_type]; |
974 | 962 |
975 // Columns | 963 // Columns |
976 for (i = 0; i < 16; ++i) { | 964 for (i = 0; i < 16; ++i) { |
977 for (j = 0; j < 16; ++j) | 965 for (j = 0; j < 16; ++j) |
978 temp_in[j] = input[j * pitch + i] << 2; | 966 temp_in[j] = input[j * stride + i] * 4; |
979 ht.cols(temp_in, temp_out); | 967 ht.cols(temp_in, temp_out); |
980 for (j = 0; j < 16; ++j) | 968 for (j = 0; j < 16; ++j) |
981 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 969 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
982 // outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 970 // outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
983 } | 971 } |
984 | 972 |
985 // Rows | 973 // Rows |
986 for (i = 0; i < 16; ++i) { | 974 for (i = 0; i < 16; ++i) { |
987 for (j = 0; j < 16; ++j) | 975 for (j = 0; j < 16; ++j) |
988 temp_in[j] = out[j + i * 16]; | 976 temp_in[j] = out[j + i * 16]; |
989 ht.rows(temp_in, temp_out); | 977 ht.rows(temp_in, temp_out); |
990 for (j = 0; j < 16; ++j) | 978 for (j = 0; j < 16; ++j) |
991 output[j + i * 16] = temp_out[j]; | 979 output[j + i * 16] = temp_out[j]; |
992 } | 980 } |
993 } | 981 } |
994 | 982 |
995 static INLINE int dct_32_round(int input) { | 983 static INLINE int dct_32_round(int input) { |
996 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); | 984 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
997 assert(-131072 <= rv && rv <= 131071); | 985 assert(-131072 <= rv && rv <= 131071); |
998 return rv; | 986 return rv; |
999 } | 987 } |
1000 | 988 |
1001 static INLINE int half_round_shift(int input) { | 989 static INLINE int half_round_shift(int input) { |
1002 int rv = (input + 1 + (input < 0)) >> 2; | 990 int rv = (input + 1 + (input < 0)) >> 2; |
1003 return rv; | 991 return rv; |
1004 } | 992 } |
1005 | 993 |
1006 static void dct32_1d(int *input, int *output, int round) { | 994 static void dct32_1d(const int *input, int *output, int round) { |
1007 int step[32]; | 995 int step[32]; |
1008 // Stage 1 | 996 // Stage 1 |
1009 step[0] = input[0] + input[(32 - 1)]; | 997 step[0] = input[0] + input[(32 - 1)]; |
1010 step[1] = input[1] + input[(32 - 2)]; | 998 step[1] = input[1] + input[(32 - 2)]; |
1011 step[2] = input[2] + input[(32 - 3)]; | 999 step[2] = input[2] + input[(32 - 3)]; |
1012 step[3] = input[3] + input[(32 - 4)]; | 1000 step[3] = input[3] + input[(32 - 4)]; |
1013 step[4] = input[4] + input[(32 - 5)]; | 1001 step[4] = input[4] + input[(32 - 5)]; |
1014 step[5] = input[5] + input[(32 - 6)]; | 1002 step[5] = input[5] + input[(32 - 6)]; |
1015 step[6] = input[6] + input[(32 - 7)]; | 1003 step[6] = input[6] + input[(32 - 7)]; |
1016 step[7] = input[7] + input[(32 - 8)]; | 1004 step[7] = input[7] + input[(32 - 8)]; |
(...skipping 302 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1319 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); | 1307 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); |
1320 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); | 1308 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); |
1321 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); | 1309 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); |
1322 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); | 1310 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); |
1323 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); | 1311 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); |
1324 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); | 1312 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); |
1325 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); | 1313 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); |
1326 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); | 1314 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); |
1327 } | 1315 } |
1328 | 1316 |
1329 void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { | 1317 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { |
1330 int shortpitch = pitch >> 1; | |
1331 int i, j; | 1318 int i, j; |
1332 int output[32 * 32]; | 1319 int output[32 * 32]; |
1333 | 1320 |
1334 // Columns | 1321 // Columns |
1335 for (i = 0; i < 32; ++i) { | 1322 for (i = 0; i < 32; ++i) { |
1336 int temp_in[32], temp_out[32]; | 1323 int temp_in[32], temp_out[32]; |
1337 for (j = 0; j < 32; ++j) | 1324 for (j = 0; j < 32; ++j) |
1338 temp_in[j] = input[j * shortpitch + i] << 2; | 1325 temp_in[j] = input[j * stride + i] * 4; |
1339 dct32_1d(temp_in, temp_out, 0); | 1326 dct32_1d(temp_in, temp_out, 0); |
1340 for (j = 0; j < 32; ++j) | 1327 for (j = 0; j < 32; ++j) |
1341 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1328 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
1342 } | 1329 } |
1343 | 1330 |
1344 // Rows | 1331 // Rows |
1345 for (i = 0; i < 32; ++i) { | 1332 for (i = 0; i < 32; ++i) { |
1346 int temp_in[32], temp_out[32]; | 1333 int temp_in[32], temp_out[32]; |
1347 for (j = 0; j < 32; ++j) | 1334 for (j = 0; j < 32; ++j) |
1348 temp_in[j] = output[j + i * 32]; | 1335 temp_in[j] = output[j + i * 32]; |
1349 dct32_1d(temp_in, temp_out, 0); | 1336 dct32_1d(temp_in, temp_out, 0); |
1350 for (j = 0; j < 32; ++j) | 1337 for (j = 0; j < 32; ++j) |
1351 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; | 1338 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
1352 } | 1339 } |
1353 } | 1340 } |
1354 | 1341 |
1355 // Note that although we use dct_32_round in dct32_1d computation flow, | 1342 // Note that although we use dct_32_round in dct32_1d computation flow, |
1356 // this 2d fdct32x32 for rate-distortion optimization loop is operating | 1343 // this 2d fdct32x32 for rate-distortion optimization loop is operating |
1357 // within 16 bits precision. | 1344 // within 16 bits precision. |
1358 void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { | 1345 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { |
1359 int shortpitch = pitch >> 1; | |
1360 int i, j; | 1346 int i, j; |
1361 int output[32 * 32]; | 1347 int output[32 * 32]; |
1362 | 1348 |
1363 // Columns | 1349 // Columns |
1364 for (i = 0; i < 32; ++i) { | 1350 for (i = 0; i < 32; ++i) { |
1365 int temp_in[32], temp_out[32]; | 1351 int temp_in[32], temp_out[32]; |
1366 for (j = 0; j < 32; ++j) | 1352 for (j = 0; j < 32; ++j) |
1367 temp_in[j] = input[j * shortpitch + i] << 2; | 1353 temp_in[j] = input[j * stride + i] * 4; |
1368 dct32_1d(temp_in, temp_out, 0); | 1354 dct32_1d(temp_in, temp_out, 0); |
1369 for (j = 0; j < 32; ++j) | 1355 for (j = 0; j < 32; ++j) |
1370 // TODO(cd): see quality impact of only doing | 1356 // TODO(cd): see quality impact of only doing |
1371 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; | 1357 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; |
1372 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c | 1358 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c |
1373 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; | 1359 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
1374 } | 1360 } |
1375 | 1361 |
1376 // Rows | 1362 // Rows |
1377 for (i = 0; i < 32; ++i) { | 1363 for (i = 0; i < 32; ++i) { |
1378 int temp_in[32], temp_out[32]; | 1364 int temp_in[32], temp_out[32]; |
1379 for (j = 0; j < 32; ++j) | 1365 for (j = 0; j < 32; ++j) |
1380 temp_in[j] = output[j + i * 32]; | 1366 temp_in[j] = output[j + i * 32]; |
1381 dct32_1d(temp_in, temp_out, 1); | 1367 dct32_1d(temp_in, temp_out, 1); |
1382 for (j = 0; j < 32; ++j) | 1368 for (j = 0; j < 32; ++j) |
1383 out[j + i * 32] = temp_out[j]; | 1369 out[j + i * 32] = temp_out[j]; |
1384 } | 1370 } |
1385 } | 1371 } |
| 1372 |
| 1373 void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
| 1374 int stride) { |
| 1375 if (tx_type == DCT_DCT) |
| 1376 vp9_fdct4x4(input, output, stride); |
| 1377 else |
| 1378 vp9_short_fht4x4(input, output, stride, tx_type); |
| 1379 } |
| 1380 |
| 1381 void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
| 1382 int stride) { |
| 1383 if (tx_type == DCT_DCT) |
| 1384 vp9_fdct8x8(input, output, stride); |
| 1385 else |
| 1386 vp9_short_fht8x8(input, output, stride, tx_type); |
| 1387 } |
| 1388 |
| 1389 void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
| 1390 int stride) { |
| 1391 if (tx_type == DCT_DCT) |
| 1392 vp9_fdct16x16(input, output, stride); |
| 1393 else |
| 1394 vp9_short_fht16x16(input, output, stride, tx_type); |
| 1395 } |
OLD | NEW |