Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(33)

Side by Side Diff: source/libvpx/vp9/encoder/vp9_dct.c

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_context_tree.c ('k') | source/libvpx/vp9/encoder/vp9_denoiser.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include <assert.h>
12 #include <math.h> 12 #include <math.h>
13 13
14 #include "./vpx_config.h" 14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h" 15 #include "./vp9_rtcd.h"
16 16
17 #include "vp9/common/vp9_blockd.h" 17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h" 18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/vp9_systemdependent.h" 19 #include "vp9/common/vp9_systemdependent.h"
20 20
21 static INLINE int fdct_round_shift(int input) { 21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 22 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23 assert(INT16_MIN <= rv && rv <= INT16_MAX); 23 // TODO(debargha, peter.derivaz): Find new bounds for this assert
24 // and make the bounds consts.
25 // assert(INT16_MIN <= rv && rv <= INT16_MAX);
24 return rv; 26 return rv;
25 } 27 }
26 28
27 static void fdct4(const int16_t *input, int16_t *output) { 29 static void fdct4(const tran_low_t *input, tran_low_t *output) {
28 int16_t step[4]; 30 tran_high_t step[4];
29 int temp1, temp2; 31 tran_high_t temp1, temp2;
30 32
31 step[0] = input[0] + input[3]; 33 step[0] = input[0] + input[3];
32 step[1] = input[1] + input[2]; 34 step[1] = input[1] + input[2];
33 step[2] = input[1] - input[2]; 35 step[2] = input[1] - input[2];
34 step[3] = input[0] - input[3]; 36 step[3] = input[0] - input[3];
35 37
36 temp1 = (step[0] + step[1]) * cospi_16_64; 38 temp1 = (step[0] + step[1]) * cospi_16_64;
37 temp2 = (step[0] - step[1]) * cospi_16_64; 39 temp2 = (step[0] - step[1]) * cospi_16_64;
38 output[0] = fdct_round_shift(temp1); 40 output[0] = fdct_round_shift(temp1);
39 output[2] = fdct_round_shift(temp2); 41 output[2] = fdct_round_shift(temp2);
40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 42 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 43 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
42 output[1] = fdct_round_shift(temp1); 44 output[1] = fdct_round_shift(temp1);
43 output[3] = fdct_round_shift(temp2); 45 output[3] = fdct_round_shift(temp2);
44 } 46 }
45 47
46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { 48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
47 int r, c; 49 int r, c;
48 int16_t sum = 0; 50 tran_low_t sum = 0;
49 for (r = 0; r < 4; ++r) 51 for (r = 0; r < 4; ++r)
50 for (c = 0; c < 4; ++c) 52 for (c = 0; c < 4; ++c)
51 sum += input[r * stride + c]; 53 sum += input[r * stride + c];
52 54
53 output[0] = sum << 1; 55 output[0] = sum << 1;
54 output[1] = 0; 56 output[1] = 0;
55 } 57 }
56 58
57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { 59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
58 // The 2D transform is done with two passes which are actually pretty 60 // The 2D transform is done with two passes which are actually pretty
59 // similar. In the first one, we transform the columns and transpose 61 // similar. In the first one, we transform the columns and transpose
60 // the results. In the second one, we transform the rows. To achieve that, 62 // the results. In the second one, we transform the rows. To achieve that,
61 // as the first pass results are transposed, we transpose the columns (that 63 // as the first pass results are transposed, we transpose the columns (that
62 // is the transposed rows) and transpose the results (so that it goes back 64 // is the transposed rows) and transpose the results (so that it goes back
63 // in normal/row positions). 65 // in normal/row positions).
64 int pass; 66 int pass;
65 // We need an intermediate buffer between passes. 67 // We need an intermediate buffer between passes.
66 int16_t intermediate[4 * 4]; 68 tran_low_t intermediate[4 * 4];
67 const int16_t *in = input; 69 const int16_t *in_pass0 = input;
68 int16_t *out = intermediate; 70 const tran_low_t *in = NULL;
71 tran_low_t *out = intermediate;
69 // Do the two transform/transpose passes 72 // Do the two transform/transpose passes
70 for (pass = 0; pass < 2; ++pass) { 73 for (pass = 0; pass < 2; ++pass) {
71 /*canbe16*/ int input[4]; 74 tran_high_t input[4]; // canbe16
72 /*canbe16*/ int step[4]; 75 tran_high_t step[4]; // canbe16
73 /*needs32*/ int temp1, temp2; 76 tran_high_t temp1, temp2; // needs32
74 int i; 77 int i;
75 for (i = 0; i < 4; ++i) { 78 for (i = 0; i < 4; ++i) {
76 // Load inputs. 79 // Load inputs.
77 if (0 == pass) { 80 if (0 == pass) {
78 input[0] = in[0 * stride] * 16; 81 input[0] = in_pass0[0 * stride] * 16;
79 input[1] = in[1 * stride] * 16; 82 input[1] = in_pass0[1 * stride] * 16;
80 input[2] = in[2 * stride] * 16; 83 input[2] = in_pass0[2 * stride] * 16;
81 input[3] = in[3 * stride] * 16; 84 input[3] = in_pass0[3 * stride] * 16;
82 if (i == 0 && input[0]) { 85 if (i == 0 && input[0]) {
83 input[0] += 1; 86 input[0] += 1;
84 } 87 }
85 } else { 88 } else {
86 input[0] = in[0 * 4]; 89 input[0] = in[0 * 4];
87 input[1] = in[1 * 4]; 90 input[1] = in[1 * 4];
88 input[2] = in[2 * 4]; 91 input[2] = in[2 * 4];
89 input[3] = in[3 * 4]; 92 input[3] = in[3 * 4];
90 } 93 }
91 // Transform. 94 // Transform.
92 step[0] = input[0] + input[3]; 95 step[0] = input[0] + input[3];
93 step[1] = input[1] + input[2]; 96 step[1] = input[1] + input[2];
94 step[2] = input[1] - input[2]; 97 step[2] = input[1] - input[2];
95 step[3] = input[0] - input[3]; 98 step[3] = input[0] - input[3];
96 temp1 = (step[0] + step[1]) * cospi_16_64; 99 temp1 = (step[0] + step[1]) * cospi_16_64;
97 temp2 = (step[0] - step[1]) * cospi_16_64; 100 temp2 = (step[0] - step[1]) * cospi_16_64;
98 out[0] = fdct_round_shift(temp1); 101 out[0] = fdct_round_shift(temp1);
99 out[2] = fdct_round_shift(temp2); 102 out[2] = fdct_round_shift(temp2);
100 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 103 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
101 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 104 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
102 out[1] = fdct_round_shift(temp1); 105 out[1] = fdct_round_shift(temp1);
103 out[3] = fdct_round_shift(temp2); 106 out[3] = fdct_round_shift(temp2);
104 // Do next column (which is a transposed row in second/horizontal pass) 107 // Do next column (which is a transposed row in second/horizontal pass)
108 in_pass0++;
105 in++; 109 in++;
106 out += 4; 110 out += 4;
107 } 111 }
108 // Setup in/out for next pass. 112 // Setup in/out for next pass.
109 in = intermediate; 113 in = intermediate;
110 out = output; 114 out = output;
111 } 115 }
112 116
113 { 117 {
114 int i, j; 118 int i, j;
115 for (i = 0; i < 4; ++i) { 119 for (i = 0; i < 4; ++i) {
116 for (j = 0; j < 4; ++j) 120 for (j = 0; j < 4; ++j)
117 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; 121 output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
118 } 122 }
119 } 123 }
120 } 124 }
121 125
122 static void fadst4(const int16_t *input, int16_t *output) { 126 static void fadst4(const tran_low_t *input, tran_low_t *output) {
123 int x0, x1, x2, x3; 127 tran_high_t x0, x1, x2, x3;
124 int s0, s1, s2, s3, s4, s5, s6, s7; 128 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
125 129
126 x0 = input[0]; 130 x0 = input[0];
127 x1 = input[1]; 131 x1 = input[1];
128 x2 = input[2]; 132 x2 = input[2];
129 x3 = input[3]; 133 x3 = input[3];
130 134
131 if (!(x0 | x1 | x2 | x3)) { 135 if (!(x0 | x1 | x2 | x3)) {
132 output[0] = output[1] = output[2] = output[3] = 0; 136 output[0] = output[1] = output[2] = output[3] = 0;
133 return; 137 return;
134 } 138 }
(...skipping 24 matching lines...) Expand all
159 output[3] = fdct_round_shift(s3); 163 output[3] = fdct_round_shift(s3);
160 } 164 }
161 165
162 static const transform_2d FHT_4[] = { 166 static const transform_2d FHT_4[] = {
163 { fdct4, fdct4 }, // DCT_DCT = 0 167 { fdct4, fdct4 }, // DCT_DCT = 0
164 { fadst4, fdct4 }, // ADST_DCT = 1 168 { fadst4, fdct4 }, // ADST_DCT = 1
165 { fdct4, fadst4 }, // DCT_ADST = 2 169 { fdct4, fadst4 }, // DCT_ADST = 2
166 { fadst4, fadst4 } // ADST_ADST = 3 170 { fadst4, fadst4 } // ADST_ADST = 3
167 }; 171 };
168 172
169 void vp9_fht4x4_c(const int16_t *input, int16_t *output, 173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
170 int stride, int tx_type) { 174 int stride, int tx_type) {
171 if (tx_type == DCT_DCT) { 175 if (tx_type == DCT_DCT) {
172 vp9_fdct4x4_c(input, output, stride); 176 vp9_fdct4x4_c(input, output, stride);
173 } else { 177 } else {
174 int16_t out[4 * 4]; 178 tran_low_t out[4 * 4];
175 int16_t *outptr = &out[0]; 179 tran_low_t *outptr = &out[0];
176 int i, j; 180 int i, j;
177 int16_t temp_in[4], temp_out[4]; 181 tran_low_t temp_in[4], temp_out[4];
178 const transform_2d ht = FHT_4[tx_type]; 182 const transform_2d ht = FHT_4[tx_type];
179 183
180 // Columns 184 // Columns
181 for (i = 0; i < 4; ++i) { 185 for (i = 0; i < 4; ++i) {
182 for (j = 0; j < 4; ++j) 186 for (j = 0; j < 4; ++j)
183 temp_in[j] = input[j * stride + i] * 16; 187 temp_in[j] = input[j * stride + i] * 16;
184 if (i == 0 && temp_in[0]) 188 if (i == 0 && temp_in[0])
185 temp_in[0] += 1; 189 temp_in[0] += 1;
186 ht.cols(temp_in, temp_out); 190 ht.cols(temp_in, temp_out);
187 for (j = 0; j < 4; ++j) 191 for (j = 0; j < 4; ++j)
188 outptr[j * 4 + i] = temp_out[j]; 192 outptr[j * 4 + i] = temp_out[j];
189 } 193 }
190 194
191 // Rows 195 // Rows
192 for (i = 0; i < 4; ++i) { 196 for (i = 0; i < 4; ++i) {
193 for (j = 0; j < 4; ++j) 197 for (j = 0; j < 4; ++j)
194 temp_in[j] = out[j + i * 4]; 198 temp_in[j] = out[j + i * 4];
195 ht.rows(temp_in, temp_out); 199 ht.rows(temp_in, temp_out);
196 for (j = 0; j < 4; ++j) 200 for (j = 0; j < 4; ++j)
197 output[j + i * 4] = (temp_out[j] + 1) >> 2; 201 output[j + i * 4] = (temp_out[j] + 1) >> 2;
198 } 202 }
199 } 203 }
200 } 204 }
201 205
202 static void fdct8(const int16_t *input, int16_t *output) { 206 static void fdct8(const tran_low_t *input, tran_low_t *output) {
203 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 207 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
204 /*needs32*/ int t0, t1, t2, t3; 208 tran_high_t t0, t1, t2, t3; // needs32
205 /*canbe16*/ int x0, x1, x2, x3; 209 tran_high_t x0, x1, x2, x3; // canbe16
206 210
207 // stage 1 211 // stage 1
208 s0 = input[0] + input[7]; 212 s0 = input[0] + input[7];
209 s1 = input[1] + input[6]; 213 s1 = input[1] + input[6];
210 s2 = input[2] + input[5]; 214 s2 = input[2] + input[5];
211 s3 = input[3] + input[4]; 215 s3 = input[3] + input[4];
212 s4 = input[3] - input[4]; 216 s4 = input[3] - input[4];
213 s5 = input[2] - input[5]; 217 s5 = input[2] - input[5];
214 s6 = input[1] - input[6]; 218 s6 = input[1] - input[6];
215 s7 = input[0] - input[7]; 219 s7 = input[0] - input[7];
(...skipping 28 matching lines...) Expand all
244 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 248 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
245 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 249 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
246 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 250 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
247 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 251 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
248 output[1] = fdct_round_shift(t0); 252 output[1] = fdct_round_shift(t0);
249 output[3] = fdct_round_shift(t2); 253 output[3] = fdct_round_shift(t2);
250 output[5] = fdct_round_shift(t1); 254 output[5] = fdct_round_shift(t1);
251 output[7] = fdct_round_shift(t3); 255 output[7] = fdct_round_shift(t3);
252 } 256 }
253 257
254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { 258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
255 int r, c; 259 int r, c;
256 int16_t sum = 0; 260 tran_low_t sum = 0;
257 for (r = 0; r < 8; ++r) 261 for (r = 0; r < 8; ++r)
258 for (c = 0; c < 8; ++c) 262 for (c = 0; c < 8; ++c)
259 sum += input[r * stride + c]; 263 sum += input[r * stride + c];
260 264
261 output[0] = sum; 265 output[0] = sum;
262 output[1] = 0; 266 output[1] = 0;
263 } 267 }
264 268
265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { 269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
266 int i, j; 270 int i, j;
267 int16_t intermediate[64]; 271 tran_low_t intermediate[64];
268 272
269 // Transform columns 273 // Transform columns
270 { 274 {
271 int16_t *output = intermediate; 275 tran_low_t *output = intermediate;
272 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
273 /*needs32*/ int t0, t1, t2, t3; 277 tran_high_t t0, t1, t2, t3; // needs32
274 /*canbe16*/ int x0, x1, x2, x3; 278 tran_high_t x0, x1, x2, x3; // canbe16
275 279
276 int i; 280 int i;
277 for (i = 0; i < 8; i++) { 281 for (i = 0; i < 8; i++) {
278 // stage 1 282 // stage 1
279 s0 = (input[0 * stride] + input[7 * stride]) * 4; 283 s0 = (input[0 * stride] + input[7 * stride]) * 4;
280 s1 = (input[1 * stride] + input[6 * stride]) * 4; 284 s1 = (input[1 * stride] + input[6 * stride]) * 4;
281 s2 = (input[2 * stride] + input[5 * stride]) * 4; 285 s2 = (input[2 * stride] + input[5 * stride]) * 4;
282 s3 = (input[3 * stride] + input[4 * stride]) * 4; 286 s3 = (input[3 * stride] + input[4 * stride]) * 4;
283 s4 = (input[3 * stride] - input[4 * stride]) * 4; 287 s4 = (input[3 * stride] - input[4 * stride]) * 4;
284 s5 = (input[2 * stride] - input[5 * stride]) * 4; 288 s5 = (input[2 * stride] - input[5 * stride]) * 4;
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
326 } 330 }
327 331
328 // Rows 332 // Rows
329 for (i = 0; i < 8; ++i) { 333 for (i = 0; i < 8; ++i) {
330 fdct8(&intermediate[i * 8], &final_output[i * 8]); 334 fdct8(&intermediate[i * 8], &final_output[i * 8]);
331 for (j = 0; j < 8; ++j) 335 for (j = 0; j < 8; ++j)
332 final_output[j + i * 8] /= 2; 336 final_output[j + i * 8] /= 2;
333 } 337 }
334 } 338 }
335 339
336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { 340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
337 int r, c; 341 int r, c;
338 int16_t sum = 0; 342 tran_low_t sum = 0;
339 for (r = 0; r < 16; ++r) 343 for (r = 0; r < 16; ++r)
340 for (c = 0; c < 16; ++c) 344 for (c = 0; c < 16; ++c)
341 sum += input[r * stride + c]; 345 sum += input[r * stride + c];
342 346
343 output[0] = sum >> 1; 347 output[0] = sum >> 1;
344 output[1] = 0; 348 output[1] = 0;
345 } 349 }
346 350
347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { 351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
348 // The 2D transform is done with two passes which are actually pretty 352 // The 2D transform is done with two passes which are actually pretty
349 // similar. In the first one, we transform the columns and transpose 353 // similar. In the first one, we transform the columns and transpose
350 // the results. In the second one, we transform the rows. To achieve that, 354 // the results. In the second one, we transform the rows. To achieve that,
351 // as the first pass results are transposed, we transpose the columns (that 355 // as the first pass results are transposed, we transpose the columns (that
352 // is the transposed rows) and transpose the results (so that it goes back 356 // is the transposed rows) and transpose the results (so that it goes back
353 // in normal/row positions). 357 // in normal/row positions).
354 int pass; 358 int pass;
355 // We need an intermediate buffer between passes. 359 // We need an intermediate buffer between passes.
356 int16_t intermediate[256]; 360 tran_low_t intermediate[256];
357 const int16_t *in = input; 361 const int16_t *in_pass0 = input;
358 int16_t *out = intermediate; 362 const tran_low_t *in = NULL;
363 tran_low_t *out = intermediate;
359 // Do the two transform/transpose passes 364 // Do the two transform/transpose passes
360 for (pass = 0; pass < 2; ++pass) { 365 for (pass = 0; pass < 2; ++pass) {
361 /*canbe16*/ int step1[8]; 366 tran_high_t step1[8]; // canbe16
362 /*canbe16*/ int step2[8]; 367 tran_high_t step2[8]; // canbe16
363 /*canbe16*/ int step3[8]; 368 tran_high_t step3[8]; // canbe16
364 /*canbe16*/ int input[8]; 369 tran_high_t input[8]; // canbe16
365 /*needs32*/ int temp1, temp2; 370 tran_high_t temp1, temp2; // needs32
366 int i; 371 int i;
367 for (i = 0; i < 16; i++) { 372 for (i = 0; i < 16; i++) {
368 if (0 == pass) { 373 if (0 == pass) {
369 // Calculate input for the first 8 results. 374 // Calculate input for the first 8 results.
370 input[0] = (in[0 * stride] + in[15 * stride]) * 4; 375 input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
371 input[1] = (in[1 * stride] + in[14 * stride]) * 4; 376 input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
372 input[2] = (in[2 * stride] + in[13 * stride]) * 4; 377 input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
373 input[3] = (in[3 * stride] + in[12 * stride]) * 4; 378 input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
374 input[4] = (in[4 * stride] + in[11 * stride]) * 4; 379 input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
375 input[5] = (in[5 * stride] + in[10 * stride]) * 4; 380 input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
376 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; 381 input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
377 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; 382 input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
378 // Calculate input for the next 8 results. 383 // Calculate input for the next 8 results.
379 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; 384 step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
380 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; 385 step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
381 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; 386 step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
382 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; 387 step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
383 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; 388 step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
384 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; 389 step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
385 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; 390 step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
386 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; 391 step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
387 } else { 392 } else {
388 // Calculate input for the first 8 results. 393 // Calculate input for the first 8 results.
389 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); 394 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
390 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); 395 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
391 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); 396 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
392 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); 397 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
393 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); 398 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
394 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); 399 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
395 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); 400 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
396 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); 401 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
397 // Calculate input for the next 8 results. 402 // Calculate input for the next 8 results.
398 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); 403 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
399 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); 404 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
400 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); 405 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
401 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); 406 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
402 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); 407 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
403 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); 408 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
404 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); 409 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
405 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); 410 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
406 } 411 }
407 // Work on the first eight values; fdct8(input, even_results); 412 // Work on the first eight values; fdct8(input, even_results);
408 { 413 {
409 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 414 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
410 /*needs32*/ int t0, t1, t2, t3; 415 tran_high_t t0, t1, t2, t3; // needs32
411 /*canbe16*/ int x0, x1, x2, x3; 416 tran_high_t x0, x1, x2, x3; // canbe16
412 417
413 // stage 1 418 // stage 1
414 s0 = input[0] + input[7]; 419 s0 = input[0] + input[7];
415 s1 = input[1] + input[6]; 420 s1 = input[1] + input[6];
416 s2 = input[2] + input[5]; 421 s2 = input[2] + input[5];
417 s3 = input[3] + input[4]; 422 s3 = input[3] + input[4];
418 s4 = input[3] - input[4]; 423 s4 = input[3] - input[4];
419 s5 = input[2] - input[5]; 424 s5 = input[2] - input[5];
420 s6 = input[1] - input[6]; 425 s6 = input[1] - input[6];
421 s7 = input[0] - input[7]; 426 s7 = input[0] - input[7];
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
507 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 512 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
508 out[3] = fdct_round_shift(temp1); 513 out[3] = fdct_round_shift(temp1);
509 out[11] = fdct_round_shift(temp2); 514 out[11] = fdct_round_shift(temp2);
510 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 515 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
511 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 516 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
512 out[7] = fdct_round_shift(temp1); 517 out[7] = fdct_round_shift(temp1);
513 out[15] = fdct_round_shift(temp2); 518 out[15] = fdct_round_shift(temp2);
514 } 519 }
515 // Do next column (which is a transposed row in second/horizontal pass) 520 // Do next column (which is a transposed row in second/horizontal pass)
516 in++; 521 in++;
522 in_pass0++;
517 out += 16; 523 out += 16;
518 } 524 }
519 // Setup in/out for next pass. 525 // Setup in/out for next pass.
520 in = intermediate; 526 in = intermediate;
521 out = output; 527 out = output;
522 } 528 }
523 } 529 }
524 530
525 static void fadst8(const int16_t *input, int16_t *output) { 531 static void fadst8(const tran_low_t *input, tran_low_t *output) {
526 int s0, s1, s2, s3, s4, s5, s6, s7; 532 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
527 533
528 int x0 = input[7]; 534 tran_high_t x0 = input[7];
529 int x1 = input[0]; 535 tran_high_t x1 = input[0];
530 int x2 = input[5]; 536 tran_high_t x2 = input[5];
531 int x3 = input[2]; 537 tran_high_t x3 = input[2];
532 int x4 = input[3]; 538 tran_high_t x4 = input[3];
533 int x5 = input[4]; 539 tran_high_t x5 = input[4];
534 int x6 = input[1]; 540 tran_high_t x6 = input[1];
535 int x7 = input[6]; 541 tran_high_t x7 = input[6];
536 542
537 // stage 1 543 // stage 1
538 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 544 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
539 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 545 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
540 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 546 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
541 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 547 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
542 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 548 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
543 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 549 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
544 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 550 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
545 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 551 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
593 output[7] = - x1; 599 output[7] = - x1;
594 } 600 }
595 601
596 static const transform_2d FHT_8[] = { 602 static const transform_2d FHT_8[] = {
597 { fdct8, fdct8 }, // DCT_DCT = 0 603 { fdct8, fdct8 }, // DCT_DCT = 0
598 { fadst8, fdct8 }, // ADST_DCT = 1 604 { fadst8, fdct8 }, // ADST_DCT = 1
599 { fdct8, fadst8 }, // DCT_ADST = 2 605 { fdct8, fadst8 }, // DCT_ADST = 2
600 { fadst8, fadst8 } // ADST_ADST = 3 606 { fadst8, fadst8 } // ADST_ADST = 3
601 }; 607 };
602 608
603 void vp9_fht8x8_c(const int16_t *input, int16_t *output, 609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
604 int stride, int tx_type) { 610 int stride, int tx_type) {
605 if (tx_type == DCT_DCT) { 611 if (tx_type == DCT_DCT) {
606 vp9_fdct8x8_c(input, output, stride); 612 vp9_fdct8x8_c(input, output, stride);
607 } else { 613 } else {
608 int16_t out[64]; 614 tran_low_t out[64];
609 int16_t *outptr = &out[0]; 615 tran_low_t *outptr = &out[0];
610 int i, j; 616 int i, j;
611 int16_t temp_in[8], temp_out[8]; 617 tran_low_t temp_in[8], temp_out[8];
612 const transform_2d ht = FHT_8[tx_type]; 618 const transform_2d ht = FHT_8[tx_type];
613 619
614 // Columns 620 // Columns
615 for (i = 0; i < 8; ++i) { 621 for (i = 0; i < 8; ++i) {
616 for (j = 0; j < 8; ++j) 622 for (j = 0; j < 8; ++j)
617 temp_in[j] = input[j * stride + i] * 4; 623 temp_in[j] = input[j * stride + i] * 4;
618 ht.cols(temp_in, temp_out); 624 ht.cols(temp_in, temp_out);
619 for (j = 0; j < 8; ++j) 625 for (j = 0; j < 8; ++j)
620 outptr[j * 8 + i] = temp_out[j]; 626 outptr[j * 8 + i] = temp_out[j];
621 } 627 }
622 628
623 // Rows 629 // Rows
624 for (i = 0; i < 8; ++i) { 630 for (i = 0; i < 8; ++i) {
625 for (j = 0; j < 8; ++j) 631 for (j = 0; j < 8; ++j)
626 temp_in[j] = out[j + i * 8]; 632 temp_in[j] = out[j + i * 8];
627 ht.rows(temp_in, temp_out); 633 ht.rows(temp_in, temp_out);
628 for (j = 0; j < 8; ++j) 634 for (j = 0; j < 8; ++j)
629 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 635 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
630 } 636 }
631 } 637 }
632 } 638 }
633 639
634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
635 pixel. */ 641 pixel. */
636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { 642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
637 int i; 643 int i;
638 int a1, b1, c1, d1, e1; 644 tran_high_t a1, b1, c1, d1, e1;
639 const int16_t *ip = input; 645 const int16_t *ip_pass0 = input;
640 int16_t *op = output; 646 const tran_low_t *ip = NULL;
647 tran_low_t *op = output;
641 648
642 for (i = 0; i < 4; i++) { 649 for (i = 0; i < 4; i++) {
643 a1 = ip[0 * stride]; 650 a1 = ip_pass0[0 * stride];
644 b1 = ip[1 * stride]; 651 b1 = ip_pass0[1 * stride];
645 c1 = ip[2 * stride]; 652 c1 = ip_pass0[2 * stride];
646 d1 = ip[3 * stride]; 653 d1 = ip_pass0[3 * stride];
647 654
648 a1 += b1; 655 a1 += b1;
649 d1 = d1 - c1; 656 d1 = d1 - c1;
650 e1 = (a1 - d1) >> 1; 657 e1 = (a1 - d1) >> 1;
651 b1 = e1 - b1; 658 b1 = e1 - b1;
652 c1 = e1 - c1; 659 c1 = e1 - c1;
653 a1 -= c1; 660 a1 -= c1;
654 d1 += b1; 661 d1 += b1;
655 op[0] = a1; 662 op[0] = a1;
656 op[4] = c1; 663 op[4] = c1;
657 op[8] = d1; 664 op[8] = d1;
658 op[12] = b1; 665 op[12] = b1;
659 666
660 ip++; 667 ip_pass0++;
661 op++; 668 op++;
662 } 669 }
663 ip = output; 670 ip = output;
664 op = output; 671 op = output;
665 672
666 for (i = 0; i < 4; i++) { 673 for (i = 0; i < 4; i++) {
667 a1 = ip[0]; 674 a1 = ip[0];
668 b1 = ip[1]; 675 b1 = ip[1];
669 c1 = ip[2]; 676 c1 = ip[2];
670 d1 = ip[3]; 677 d1 = ip[3];
671 678
672 a1 += b1; 679 a1 += b1;
673 d1 -= c1; 680 d1 -= c1;
674 e1 = (a1 - d1) >> 1; 681 e1 = (a1 - d1) >> 1;
675 b1 = e1 - b1; 682 b1 = e1 - b1;
676 c1 = e1 - c1; 683 c1 = e1 - c1;
677 a1 -= c1; 684 a1 -= c1;
678 d1 += b1; 685 d1 += b1;
679 op[0] = a1 * UNIT_QUANT_FACTOR; 686 op[0] = a1 * UNIT_QUANT_FACTOR;
680 op[1] = c1 * UNIT_QUANT_FACTOR; 687 op[1] = c1 * UNIT_QUANT_FACTOR;
681 op[2] = d1 * UNIT_QUANT_FACTOR; 688 op[2] = d1 * UNIT_QUANT_FACTOR;
682 op[3] = b1 * UNIT_QUANT_FACTOR; 689 op[3] = b1 * UNIT_QUANT_FACTOR;
683 690
684 ip += 4; 691 ip += 4;
685 op += 4; 692 op += 4;
686 } 693 }
687 } 694 }
688 695
689 // Rewrote to use same algorithm as others. 696 // Rewrote to use same algorithm as others.
690 static void fdct16(const int16_t in[16], int16_t out[16]) { 697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
691 /*canbe16*/ int step1[8]; 698 tran_high_t step1[8]; // canbe16
692 /*canbe16*/ int step2[8]; 699 tran_high_t step2[8]; // canbe16
693 /*canbe16*/ int step3[8]; 700 tran_high_t step3[8]; // canbe16
694 /*canbe16*/ int input[8]; 701 tran_high_t input[8]; // canbe16
695 /*needs32*/ int temp1, temp2; 702 tran_high_t temp1, temp2; // needs32
696 703
697 // step 1 704 // step 1
698 input[0] = in[0] + in[15]; 705 input[0] = in[0] + in[15];
699 input[1] = in[1] + in[14]; 706 input[1] = in[1] + in[14];
700 input[2] = in[2] + in[13]; 707 input[2] = in[2] + in[13];
701 input[3] = in[3] + in[12]; 708 input[3] = in[3] + in[12];
702 input[4] = in[4] + in[11]; 709 input[4] = in[4] + in[11];
703 input[5] = in[5] + in[10]; 710 input[5] = in[5] + in[10];
704 input[6] = in[6] + in[ 9]; 711 input[6] = in[6] + in[ 9];
705 input[7] = in[7] + in[ 8]; 712 input[7] = in[7] + in[ 8];
706 713
707 step1[0] = in[7] - in[ 8]; 714 step1[0] = in[7] - in[ 8];
708 step1[1] = in[6] - in[ 9]; 715 step1[1] = in[6] - in[ 9];
709 step1[2] = in[5] - in[10]; 716 step1[2] = in[5] - in[10];
710 step1[3] = in[4] - in[11]; 717 step1[3] = in[4] - in[11];
711 step1[4] = in[3] - in[12]; 718 step1[4] = in[3] - in[12];
712 step1[5] = in[2] - in[13]; 719 step1[5] = in[2] - in[13];
713 step1[6] = in[1] - in[14]; 720 step1[6] = in[1] - in[14];
714 step1[7] = in[0] - in[15]; 721 step1[7] = in[0] - in[15];
715 722
716 // fdct8(step, step); 723 // fdct8(step, step);
717 { 724 {
718 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 725 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
719 /*needs32*/ int t0, t1, t2, t3; 726 tran_high_t t0, t1, t2, t3; // needs32
720 /*canbe16*/ int x0, x1, x2, x3; 727 tran_high_t x0, x1, x2, x3; // canbe16
721 728
722 // stage 1 729 // stage 1
723 s0 = input[0] + input[7]; 730 s0 = input[0] + input[7];
724 s1 = input[1] + input[6]; 731 s1 = input[1] + input[6];
725 s2 = input[2] + input[5]; 732 s2 = input[2] + input[5];
726 s3 = input[3] + input[4]; 733 s3 = input[3] + input[4];
727 s4 = input[3] - input[4]; 734 s4 = input[3] - input[4];
728 s5 = input[2] - input[5]; 735 s5 = input[2] - input[5];
729 s6 = input[1] - input[6]; 736 s6 = input[1] - input[6];
730 s7 = input[0] - input[7]; 737 s7 = input[0] - input[7];
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
821 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 828 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
822 out[3] = fdct_round_shift(temp1); 829 out[3] = fdct_round_shift(temp1);
823 out[11] = fdct_round_shift(temp2); 830 out[11] = fdct_round_shift(temp2);
824 831
825 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 832 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
826 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 833 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
827 out[7] = fdct_round_shift(temp1); 834 out[7] = fdct_round_shift(temp1);
828 out[15] = fdct_round_shift(temp2); 835 out[15] = fdct_round_shift(temp2);
829 } 836 }
830 837
831 static void fadst16(const int16_t *input, int16_t *output) { 838 static void fadst16(const tran_low_t *input, tran_low_t *output) {
832 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 839 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
840 tran_high_t s9, s10, s11, s12, s13, s14, s15;
833 841
834 int x0 = input[15]; 842 tran_high_t x0 = input[15];
835 int x1 = input[0]; 843 tran_high_t x1 = input[0];
836 int x2 = input[13]; 844 tran_high_t x2 = input[13];
837 int x3 = input[2]; 845 tran_high_t x3 = input[2];
838 int x4 = input[11]; 846 tran_high_t x4 = input[11];
839 int x5 = input[4]; 847 tran_high_t x5 = input[4];
840 int x6 = input[9]; 848 tran_high_t x6 = input[9];
841 int x7 = input[6]; 849 tran_high_t x7 = input[6];
842 int x8 = input[7]; 850 tran_high_t x8 = input[7];
843 int x9 = input[8]; 851 tran_high_t x9 = input[8];
844 int x10 = input[5]; 852 tran_high_t x10 = input[5];
845 int x11 = input[10]; 853 tran_high_t x11 = input[10];
846 int x12 = input[3]; 854 tran_high_t x12 = input[3];
847 int x13 = input[12]; 855 tran_high_t x13 = input[12];
848 int x14 = input[1]; 856 tran_high_t x14 = input[1];
849 int x15 = input[14]; 857 tran_high_t x15 = input[14];
850 858
851 // stage 1 859 // stage 1
852 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 860 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
853 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 861 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
854 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 862 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
855 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 863 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
856 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 864 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
857 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 865 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
858 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 866 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
859 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 867 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after
990 output[15] = - x1; 998 output[15] = - x1;
991 } 999 }
992 1000
993 static const transform_2d FHT_16[] = { 1001 static const transform_2d FHT_16[] = {
994 { fdct16, fdct16 }, // DCT_DCT = 0 1002 { fdct16, fdct16 }, // DCT_DCT = 0
995 { fadst16, fdct16 }, // ADST_DCT = 1 1003 { fadst16, fdct16 }, // ADST_DCT = 1
996 { fdct16, fadst16 }, // DCT_ADST = 2 1004 { fdct16, fadst16 }, // DCT_ADST = 2
997 { fadst16, fadst16 } // ADST_ADST = 3 1005 { fadst16, fadst16 } // ADST_ADST = 3
998 }; 1006 };
999 1007
1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output, 1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
1001 int stride, int tx_type) { 1009 int stride, int tx_type) {
1002 if (tx_type == DCT_DCT) { 1010 if (tx_type == DCT_DCT) {
1003 vp9_fdct16x16_c(input, output, stride); 1011 vp9_fdct16x16_c(input, output, stride);
1004 } else { 1012 } else {
1005 int16_t out[256]; 1013 tran_low_t out[256];
1006 int16_t *outptr = &out[0]; 1014 tran_low_t *outptr = &out[0];
1007 int i, j; 1015 int i, j;
1008 int16_t temp_in[16], temp_out[16]; 1016 tran_low_t temp_in[16], temp_out[16];
1009 const transform_2d ht = FHT_16[tx_type]; 1017 const transform_2d ht = FHT_16[tx_type];
1010 1018
1011 // Columns 1019 // Columns
1012 for (i = 0; i < 16; ++i) { 1020 for (i = 0; i < 16; ++i) {
1013 for (j = 0; j < 16; ++j) 1021 for (j = 0; j < 16; ++j)
1014 temp_in[j] = input[j * stride + i] * 4; 1022 temp_in[j] = input[j * stride + i] * 4;
1015 ht.cols(temp_in, temp_out); 1023 ht.cols(temp_in, temp_out);
1016 for (j = 0; j < 16; ++j) 1024 for (j = 0; j < 16; ++j)
1017 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1025 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1018 } 1026 }
1019 1027
1020 // Rows 1028 // Rows
1021 for (i = 0; i < 16; ++i) { 1029 for (i = 0; i < 16; ++i) {
1022 for (j = 0; j < 16; ++j) 1030 for (j = 0; j < 16; ++j)
1023 temp_in[j] = out[j + i * 16]; 1031 temp_in[j] = out[j + i * 16];
1024 ht.rows(temp_in, temp_out); 1032 ht.rows(temp_in, temp_out);
1025 for (j = 0; j < 16; ++j) 1033 for (j = 0; j < 16; ++j)
1026 output[j + i * 16] = temp_out[j]; 1034 output[j + i * 16] = temp_out[j];
1027 } 1035 }
1028 } 1036 }
1029 } 1037 }
1030 1038
1031 static INLINE int dct_32_round(int input) { 1039 static INLINE tran_high_t dct_32_round(tran_high_t input) {
1032 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 1040 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1033 assert(-131072 <= rv && rv <= 131071); 1041 // TODO(debargha, peter.derivaz): Find new bounds for this assert,
1042 // and make the bounds consts.
1043 // assert(-131072 <= rv && rv <= 131071);
1034 return rv; 1044 return rv;
1035 } 1045 }
1036 1046
1037 static INLINE int half_round_shift(int input) { 1047 static INLINE tran_high_t half_round_shift(tran_high_t input) {
1038 int rv = (input + 1 + (input < 0)) >> 2; 1048 tran_high_t rv = (input + 1 + (input < 0)) >> 2;
1039 return rv; 1049 return rv;
1040 } 1050 }
1041 1051
1042 static void fdct32(const int *input, int *output, int round) { 1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
1043 int step[32]; 1053 tran_high_t step[32];
1044 // Stage 1 1054 // Stage 1
1045 step[0] = input[0] + input[(32 - 1)]; 1055 step[0] = input[0] + input[(32 - 1)];
1046 step[1] = input[1] + input[(32 - 2)]; 1056 step[1] = input[1] + input[(32 - 2)];
1047 step[2] = input[2] + input[(32 - 3)]; 1057 step[2] = input[2] + input[(32 - 3)];
1048 step[3] = input[3] + input[(32 - 4)]; 1058 step[3] = input[3] + input[(32 - 4)];
1049 step[4] = input[4] + input[(32 - 5)]; 1059 step[4] = input[4] + input[(32 - 5)];
1050 step[5] = input[5] + input[(32 - 6)]; 1060 step[5] = input[5] + input[(32 - 6)];
1051 step[6] = input[6] + input[(32 - 7)]; 1061 step[6] = input[6] + input[(32 - 7)];
1052 step[7] = input[7] + input[(32 - 8)]; 1062 step[7] = input[7] + input[(32 - 8)];
1053 step[8] = input[8] + input[(32 - 9)]; 1063 step[8] = input[8] + input[(32 - 9)];
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after
1355 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); 1365 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1356 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); 1366 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1357 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); 1367 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1358 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); 1368 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1359 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); 1369 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1360 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); 1370 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1361 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); 1371 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1362 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); 1372 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1363 } 1373 }
1364 1374
1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { 1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
1366 int r, c; 1376 int r, c;
1367 int16_t sum = 0; 1377 tran_low_t sum = 0;
1368 for (r = 0; r < 32; ++r) 1378 for (r = 0; r < 32; ++r)
1369 for (c = 0; c < 32; ++c) 1379 for (c = 0; c < 32; ++c)
1370 sum += input[r * stride + c]; 1380 sum += input[r * stride + c];
1371 1381
1372 output[0] = sum >> 3; 1382 output[0] = sum >> 3;
1373 output[1] = 0; 1383 output[1] = 0;
1374 } 1384 }
1375 1385
1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { 1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1377 int i, j; 1387 int i, j;
1378 int output[32 * 32]; 1388 tran_high_t output[32 * 32];
1379 1389
1380 // Columns 1390 // Columns
1381 for (i = 0; i < 32; ++i) { 1391 for (i = 0; i < 32; ++i) {
1382 int temp_in[32], temp_out[32]; 1392 tran_high_t temp_in[32], temp_out[32];
1383 for (j = 0; j < 32; ++j) 1393 for (j = 0; j < 32; ++j)
1384 temp_in[j] = input[j * stride + i] * 4; 1394 temp_in[j] = input[j * stride + i] * 4;
1385 fdct32(temp_in, temp_out, 0); 1395 fdct32(temp_in, temp_out, 0);
1386 for (j = 0; j < 32; ++j) 1396 for (j = 0; j < 32; ++j)
1387 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1397 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1388 } 1398 }
1389 1399
1390 // Rows 1400 // Rows
1391 for (i = 0; i < 32; ++i) { 1401 for (i = 0; i < 32; ++i) {
1392 int temp_in[32], temp_out[32]; 1402 tran_high_t temp_in[32], temp_out[32];
1393 for (j = 0; j < 32; ++j) 1403 for (j = 0; j < 32; ++j)
1394 temp_in[j] = output[j + i * 32]; 1404 temp_in[j] = output[j + i * 32];
1395 fdct32(temp_in, temp_out, 0); 1405 fdct32(temp_in, temp_out, 0);
1396 for (j = 0; j < 32; ++j) 1406 for (j = 0; j < 32; ++j)
1397 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1407 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1398 } 1408 }
1399 } 1409 }
1400 1410
1401 // Note that although we use dct_32_round in dct32 computation flow, 1411 // Note that although we use dct_32_round in dct32 computation flow,
1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating 1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating
1403 // within 16 bits precision. 1413 // within 16 bits precision.
1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { 1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
1405 int i, j; 1415 int i, j;
1406 int output[32 * 32]; 1416 tran_high_t output[32 * 32];
1407 1417
1408 // Columns 1418 // Columns
1409 for (i = 0; i < 32; ++i) { 1419 for (i = 0; i < 32; ++i) {
1410 int temp_in[32], temp_out[32]; 1420 tran_high_t temp_in[32], temp_out[32];
1411 for (j = 0; j < 32; ++j) 1421 for (j = 0; j < 32; ++j)
1412 temp_in[j] = input[j * stride + i] * 4; 1422 temp_in[j] = input[j * stride + i] * 4;
1413 fdct32(temp_in, temp_out, 0); 1423 fdct32(temp_in, temp_out, 0);
1414 for (j = 0; j < 32; ++j) 1424 for (j = 0; j < 32; ++j)
1415 // TODO(cd): see quality impact of only doing 1425 // TODO(cd): see quality impact of only doing
1416 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; 1426 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1417 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c 1427 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1418 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1428 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1419 } 1429 }
1420 1430
1421 // Rows 1431 // Rows
1422 for (i = 0; i < 32; ++i) { 1432 for (i = 0; i < 32; ++i) {
1423 int temp_in[32], temp_out[32]; 1433 tran_high_t temp_in[32], temp_out[32];
1424 for (j = 0; j < 32; ++j) 1434 for (j = 0; j < 32; ++j)
1425 temp_in[j] = output[j + i * 32]; 1435 temp_in[j] = output[j + i * 32];
1426 fdct32(temp_in, temp_out, 1); 1436 fdct32(temp_in, temp_out, 1);
1427 for (j = 0; j < 32; ++j) 1437 for (j = 0; j < 32; ++j)
1428 out[j + i * 32] = temp_out[j]; 1438 out[j + i * 32] = temp_out[j];
1429 } 1439 }
1430 } 1440 }
1441
1442 #if CONFIG_VP9_HIGHBITDEPTH
1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1444 vp9_fdct4x4_c(input, output, stride);
1445 }
1446
1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
1448 int stride, int tx_type) {
1449 vp9_fht4x4_c(input, output, stride, tx_type);
1450 }
1451
1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
1453 int stride) {
1454 vp9_fdct8x8_1_c(input, final_output, stride);
1455 }
1456
1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
1458 int stride) {
1459 vp9_fdct8x8_c(input, final_output, stride);
1460 }
1461
1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
1463 int stride) {
1464 vp9_fdct16x16_1_c(input, output, stride);
1465 }
1466
1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
1468 int stride) {
1469 vp9_fdct16x16_c(input, output, stride);
1470 }
1471
1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
1473 int stride, int tx_type) {
1474 vp9_fht8x8_c(input, output, stride, tx_type);
1475 }
1476
1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1478 vp9_fwht4x4_c(input, output, stride);
1479 }
1480
1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
1482 int stride, int tx_type) {
1483 vp9_fht16x16_c(input, output, stride, tx_type);
1484 }
1485
1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
1487 vp9_fdct32x32_1_c(input, out, stride);
1488 }
1489
1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1491 vp9_fdct32x32_c(input, out, stride);
1492 }
1493
1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
1495 int stride) {
1496 vp9_fdct32x32_rd_c(input, out, stride);
1497 }
1498 #endif // CONFIG_VP9_HIGHBITDEPTH
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_context_tree.c ('k') | source/libvpx/vp9/encoder/vp9_denoiser.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698