Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(160)

Side by Side Diff: source/libvpx/vp9/common/vp9_idct.c

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/common/vp9_idct.h ('k') | source/libvpx/vp9/common/vp9_loopfilter.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include <assert.h>
12 #include <math.h> 12 #include <math.h>
13 13
14 #include "./vpx_config.h" 14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h" 15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_systemdependent.h" 16 #include "vp9/common/vp9_systemdependent.h"
17 #include "vp9/common/vp9_blockd.h" 17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_common.h" 18 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_idct.h" 19 #include "vp9/common/vp9_idct.h"
20 20
21 void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { 21 #if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
 22 // When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs strict
23 // overflow wrapping to match expected hardware implementations.
 24 // bd of 8 uses tran_low_t with 16 bits, need to remove 16 bits
 25 // bd of 10 uses tran_low_t with 18 bits, need to remove 14 bits
 26 // bd of 12 uses tran_low_t with 20 bits, need to remove 12 bits
 27 // bd of x uses tran_low_t with 8+x bits, need to remove 24-x bits
28 #define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))
29 #else
30 #define WRAPLOW(x) (x)
31 #endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
32
33 #if CONFIG_VP9_HIGHBITDEPTH
34 static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
35 tran_low_t high) {
36 return value < low ? low : (value > high ? high : value);
37 }
38
39 static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
40 tran_high_t trans, int bd) {
41 trans = WRAPLOW(trans);
42 switch (bd) {
43 case 8:
44 default:
45 return clamp_high(WRAPLOW(dest + trans), 0, 255);
46 case 10:
47 return clamp_high(WRAPLOW(dest + trans), 0, 1023);
48 case 12:
49 return clamp_high(WRAPLOW(dest + trans), 0, 4095);
50 }
51 }
52 #endif // CONFIG_VP9_HIGHBITDEPTH
53
54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
23 0.5 shifts per pixel. */ 56 0.5 shifts per pixel. */
24 int i; 57 int i;
25 int16_t output[16]; 58 tran_low_t output[16];
26 int a1, b1, c1, d1, e1; 59 tran_high_t a1, b1, c1, d1, e1;
27 const int16_t *ip = input; 60 const tran_low_t *ip = input;
28 int16_t *op = output; 61 tran_low_t *op = output;
29 62
30 for (i = 0; i < 4; i++) { 63 for (i = 0; i < 4; i++) {
31 a1 = ip[0] >> UNIT_QUANT_SHIFT; 64 a1 = ip[0] >> UNIT_QUANT_SHIFT;
32 c1 = ip[1] >> UNIT_QUANT_SHIFT; 65 c1 = ip[1] >> UNIT_QUANT_SHIFT;
33 d1 = ip[2] >> UNIT_QUANT_SHIFT; 66 d1 = ip[2] >> UNIT_QUANT_SHIFT;
34 b1 = ip[3] >> UNIT_QUANT_SHIFT; 67 b1 = ip[3] >> UNIT_QUANT_SHIFT;
35 a1 += c1; 68 a1 += c1;
36 d1 -= b1; 69 d1 -= b1;
37 e1 = (a1 - d1) >> 1; 70 e1 = (a1 - d1) >> 1;
38 b1 = e1 - b1; 71 b1 = e1 - b1;
(...skipping 24 matching lines...) Expand all
63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); 96 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); 97 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); 98 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); 99 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
67 100
68 ip++; 101 ip++;
69 dest++; 102 dest++;
70 } 103 }
71 } 104 }
72 105
73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { 106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
74 int i; 107 int i;
75 int a1, e1; 108 tran_high_t a1, e1;
76 int16_t tmp[4]; 109 tran_low_t tmp[4];
77 const int16_t *ip = in; 110 const tran_low_t *ip = in;
78 int16_t *op = tmp; 111 tran_low_t *op = tmp;
79 112
80 a1 = ip[0] >> UNIT_QUANT_SHIFT; 113 a1 = ip[0] >> UNIT_QUANT_SHIFT;
81 e1 = a1 >> 1; 114 e1 = a1 >> 1;
82 a1 -= e1; 115 a1 -= e1;
83 op[0] = a1; 116 op[0] = a1;
84 op[1] = op[2] = op[3] = e1; 117 op[1] = op[2] = op[3] = e1;
85 118
86 ip = tmp; 119 ip = tmp;
87 for (i = 0; i < 4; i++) { 120 for (i = 0; i < 4; i++) {
88 e1 = ip[0] >> 1; 121 e1 = ip[0] >> 1;
89 a1 = ip[0] - e1; 122 a1 = ip[0] - e1;
90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); 123 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); 124 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); 125 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); 126 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
94 ip++; 127 ip++;
95 dest++; 128 dest++;
96 } 129 }
97 } 130 }
98 131
99 static void idct4(const int16_t *input, int16_t *output) { 132 static void idct4(const tran_low_t *input, tran_low_t *output) {
100 int16_t step[4]; 133 tran_low_t step[4];
101 int temp1, temp2; 134 tran_high_t temp1, temp2;
102 // stage 1 135 // stage 1
103 temp1 = (input[0] + input[2]) * cospi_16_64; 136 temp1 = (input[0] + input[2]) * cospi_16_64;
104 temp2 = (input[0] - input[2]) * cospi_16_64; 137 temp2 = (input[0] - input[2]) * cospi_16_64;
105 step[0] = dct_const_round_shift(temp1); 138 step[0] = dct_const_round_shift(temp1);
106 step[1] = dct_const_round_shift(temp2); 139 step[1] = dct_const_round_shift(temp2);
107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 140 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 141 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
109 step[2] = dct_const_round_shift(temp1); 142 step[2] = dct_const_round_shift(temp1);
110 step[3] = dct_const_round_shift(temp2); 143 step[3] = dct_const_round_shift(temp2);
111 144
112 // stage 2 145 // stage 2
113 output[0] = step[0] + step[3]; 146 output[0] = step[0] + step[3];
114 output[1] = step[1] + step[2]; 147 output[1] = step[1] + step[2];
115 output[2] = step[1] - step[2]; 148 output[2] = step[1] - step[2];
116 output[3] = step[0] - step[3]; 149 output[3] = step[0] - step[3];
117 } 150 }
118 151
119 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { 152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
120 int16_t out[4 * 4]; 153 tran_low_t out[4 * 4];
121 int16_t *outptr = out; 154 tran_low_t *outptr = out;
122 int i, j; 155 int i, j;
123 int16_t temp_in[4], temp_out[4]; 156 tran_low_t temp_in[4], temp_out[4];
124 157
125 // Rows 158 // Rows
126 for (i = 0; i < 4; ++i) { 159 for (i = 0; i < 4; ++i) {
127 idct4(input, outptr); 160 idct4(input, outptr);
128 input += 4; 161 input += 4;
129 outptr += 4; 162 outptr += 4;
130 } 163 }
131 164
132 // Columns 165 // Columns
133 for (i = 0; i < 4; ++i) { 166 for (i = 0; i < 4; ++i) {
134 for (j = 0; j < 4; ++j) 167 for (j = 0; j < 4; ++j)
135 temp_in[j] = out[j * 4 + i]; 168 temp_in[j] = out[j * 4 + i];
136 idct4(temp_in, temp_out); 169 idct4(temp_in, temp_out);
137 for (j = 0; j < 4; ++j) 170 for (j = 0; j < 4; ++j)
138 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 171 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
139 + dest[j * stride + i]); 172 + dest[j * stride + i]);
140 } 173 }
141 } 174 }
142 175
143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { 176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
177 int dest_stride) {
144 int i; 178 int i;
145 int a1; 179 tran_high_t a1;
146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 180 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
147 out = dct_const_round_shift(out * cospi_16_64); 181 out = dct_const_round_shift(out * cospi_16_64);
148 a1 = ROUND_POWER_OF_TWO(out, 4); 182 a1 = ROUND_POWER_OF_TWO(out, 4);
149 183
150 for (i = 0; i < 4; i++) { 184 for (i = 0; i < 4; i++) {
151 dest[0] = clip_pixel(dest[0] + a1); 185 dest[0] = clip_pixel(dest[0] + a1);
152 dest[1] = clip_pixel(dest[1] + a1); 186 dest[1] = clip_pixel(dest[1] + a1);
153 dest[2] = clip_pixel(dest[2] + a1); 187 dest[2] = clip_pixel(dest[2] + a1);
154 dest[3] = clip_pixel(dest[3] + a1); 188 dest[3] = clip_pixel(dest[3] + a1);
155 dest += dest_stride; 189 dest += dest_stride;
156 } 190 }
157 } 191 }
158 192
159 static void idct8(const int16_t *input, int16_t *output) { 193 static void idct8(const tran_low_t *input, tran_low_t *output) {
160 int16_t step1[8], step2[8]; 194 tran_low_t step1[8], step2[8];
161 int temp1, temp2; 195 tran_high_t temp1, temp2;
162 // stage 1 196 // stage 1
163 step1[0] = input[0]; 197 step1[0] = input[0];
164 step1[2] = input[4]; 198 step1[2] = input[4];
165 step1[1] = input[2]; 199 step1[1] = input[2];
166 step1[3] = input[6]; 200 step1[3] = input[6];
167 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
168 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
169 step1[4] = dct_const_round_shift(temp1); 203 step1[4] = dct_const_round_shift(temp1);
170 step1[7] = dct_const_round_shift(temp2); 204 step1[7] = dct_const_round_shift(temp2);
171 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
(...skipping 22 matching lines...) Expand all
194 output[0] = step1[0] + step1[7]; 228 output[0] = step1[0] + step1[7];
195 output[1] = step1[1] + step1[6]; 229 output[1] = step1[1] + step1[6];
196 output[2] = step1[2] + step1[5]; 230 output[2] = step1[2] + step1[5];
197 output[3] = step1[3] + step1[4]; 231 output[3] = step1[3] + step1[4];
198 output[4] = step1[3] - step1[4]; 232 output[4] = step1[3] - step1[4];
199 output[5] = step1[2] - step1[5]; 233 output[5] = step1[2] - step1[5];
200 output[6] = step1[1] - step1[6]; 234 output[6] = step1[1] - step1[6];
201 output[7] = step1[0] - step1[7]; 235 output[7] = step1[0] - step1[7];
202 } 236 }
203 237
204 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { 238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
205 int16_t out[8 * 8]; 239 tran_low_t out[8 * 8];
206 int16_t *outptr = out; 240 tran_low_t *outptr = out;
207 int i, j; 241 int i, j;
208 int16_t temp_in[8], temp_out[8]; 242 tran_low_t temp_in[8], temp_out[8];
209 243
210 // First transform rows 244 // First transform rows
211 for (i = 0; i < 8; ++i) { 245 for (i = 0; i < 8; ++i) {
212 idct8(input, outptr); 246 idct8(input, outptr);
213 input += 8; 247 input += 8;
214 outptr += 8; 248 outptr += 8;
215 } 249 }
216 250
217 // Then transform columns 251 // Then transform columns
218 for (i = 0; i < 8; ++i) { 252 for (i = 0; i < 8; ++i) {
219 for (j = 0; j < 8; ++j) 253 for (j = 0; j < 8; ++j)
220 temp_in[j] = out[j * 8 + i]; 254 temp_in[j] = out[j * 8 + i];
221 idct8(temp_in, temp_out); 255 idct8(temp_in, temp_out);
222 for (j = 0; j < 8; ++j) 256 for (j = 0; j < 8; ++j)
223 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 257 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
224 + dest[j * stride + i]); 258 + dest[j * stride + i]);
225 } 259 }
226 } 260 }
227 261
228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { 262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
229 int i, j; 263 int i, j;
230 int a1; 264 tran_high_t a1;
231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 265 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
232 out = dct_const_round_shift(out * cospi_16_64); 266 out = dct_const_round_shift(out * cospi_16_64);
233 a1 = ROUND_POWER_OF_TWO(out, 5); 267 a1 = ROUND_POWER_OF_TWO(out, 5);
234 for (j = 0; j < 8; ++j) { 268 for (j = 0; j < 8; ++j) {
235 for (i = 0; i < 8; ++i) 269 for (i = 0; i < 8; ++i)
236 dest[i] = clip_pixel(dest[i] + a1); 270 dest[i] = clip_pixel(dest[i] + a1);
237 dest += stride; 271 dest += stride;
238 } 272 }
239 } 273 }
240 274
241 static void iadst4(const int16_t *input, int16_t *output) { 275 static void iadst4(const tran_low_t *input, tran_low_t *output) {
242 int s0, s1, s2, s3, s4, s5, s6, s7; 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
243 277
244 int x0 = input[0]; 278 tran_high_t x0 = input[0];
245 int x1 = input[1]; 279 tran_high_t x1 = input[1];
246 int x2 = input[2]; 280 tran_high_t x2 = input[2];
247 int x3 = input[3]; 281 tran_high_t x3 = input[3];
248 282
249 if (!(x0 | x1 | x2 | x3)) { 283 if (!(x0 | x1 | x2 | x3)) {
250 output[0] = output[1] = output[2] = output[3] = 0; 284 output[0] = output[1] = output[2] = output[3] = 0;
251 return; 285 return;
252 } 286 }
253 287
254 s0 = sinpi_1_9 * x0; 288 s0 = sinpi_1_9 * x0;
255 s1 = sinpi_2_9 * x0; 289 s1 = sinpi_2_9 * x0;
256 s2 = sinpi_3_9 * x1; 290 s2 = sinpi_3_9 * x1;
257 s3 = sinpi_4_9 * x2; 291 s3 = sinpi_4_9 * x2;
(...skipping 15 matching lines...) Expand all
273 // 1-D transform scaling factor is sqrt(2). 307 // 1-D transform scaling factor is sqrt(2).
274 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 308 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
275 // + 1b (addition) = 29b. 309 // + 1b (addition) = 29b.
276 // Hence the output bit depth is 15b. 310 // Hence the output bit depth is 15b.
277 output[0] = dct_const_round_shift(s0); 311 output[0] = dct_const_round_shift(s0);
278 output[1] = dct_const_round_shift(s1); 312 output[1] = dct_const_round_shift(s1);
279 output[2] = dct_const_round_shift(s2); 313 output[2] = dct_const_round_shift(s2);
280 output[3] = dct_const_round_shift(s3); 314 output[3] = dct_const_round_shift(s3);
281 } 315 }
282 316
283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, 317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
284 int tx_type) { 318 int tx_type) {
285 const transform_2d IHT_4[] = { 319 const transform_2d IHT_4[] = {
286 { idct4, idct4 }, // DCT_DCT = 0 320 { idct4, idct4 }, // DCT_DCT = 0
287 { iadst4, idct4 }, // ADST_DCT = 1 321 { iadst4, idct4 }, // ADST_DCT = 1
288 { idct4, iadst4 }, // DCT_ADST = 2 322 { idct4, iadst4 }, // DCT_ADST = 2
289 { iadst4, iadst4 } // ADST_ADST = 3 323 { iadst4, iadst4 } // ADST_ADST = 3
290 }; 324 };
291 325
292 int i, j; 326 int i, j;
293 int16_t out[4 * 4]; 327 tran_low_t out[4 * 4];
294 int16_t *outptr = out; 328 tran_low_t *outptr = out;
295 int16_t temp_in[4], temp_out[4]; 329 tran_low_t temp_in[4], temp_out[4];
296 330
297 // inverse transform row vectors 331 // inverse transform row vectors
298 for (i = 0; i < 4; ++i) { 332 for (i = 0; i < 4; ++i) {
299 IHT_4[tx_type].rows(input, outptr); 333 IHT_4[tx_type].rows(input, outptr);
300 input += 4; 334 input += 4;
301 outptr += 4; 335 outptr += 4;
302 } 336 }
303 337
304 // inverse transform column vectors 338 // inverse transform column vectors
305 for (i = 0; i < 4; ++i) { 339 for (i = 0; i < 4; ++i) {
306 for (j = 0; j < 4; ++j) 340 for (j = 0; j < 4; ++j)
307 temp_in[j] = out[j * 4 + i]; 341 temp_in[j] = out[j * 4 + i];
308 IHT_4[tx_type].cols(temp_in, temp_out); 342 IHT_4[tx_type].cols(temp_in, temp_out);
309 for (j = 0; j < 4; ++j) 343 for (j = 0; j < 4; ++j)
310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 344 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
311 + dest[j * stride + i]); 345 + dest[j * stride + i]);
312 } 346 }
313 } 347 }
314 static void iadst8(const int16_t *input, int16_t *output) { 348 static void iadst8(const tran_low_t *input, tran_low_t *output) {
315 int s0, s1, s2, s3, s4, s5, s6, s7; 349 int s0, s1, s2, s3, s4, s5, s6, s7;
316 350
317 int x0 = input[7]; 351 tran_high_t x0 = input[7];
318 int x1 = input[0]; 352 tran_high_t x1 = input[0];
319 int x2 = input[5]; 353 tran_high_t x2 = input[5];
320 int x3 = input[2]; 354 tran_high_t x3 = input[2];
321 int x4 = input[3]; 355 tran_high_t x4 = input[3];
322 int x5 = input[4]; 356 tran_high_t x5 = input[4];
323 int x6 = input[1]; 357 tran_high_t x6 = input[1];
324 int x7 = input[6]; 358 tran_high_t x7 = input[6];
325 359
326 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 360 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
327 output[0] = output[1] = output[2] = output[3] = output[4] 361 output[0] = output[1] = output[2] = output[3] = output[4]
328 = output[5] = output[6] = output[7] = 0; 362 = output[5] = output[6] = output[7] = 0;
329 return; 363 return;
330 } 364 }
331 365
332 // stage 1 366 // stage 1
333 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 367 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
334 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 368 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
388 output[7] = -x1; 422 output[7] = -x1;
389 } 423 }
390 424
391 static const transform_2d IHT_8[] = { 425 static const transform_2d IHT_8[] = {
392 { idct8, idct8 }, // DCT_DCT = 0 426 { idct8, idct8 }, // DCT_DCT = 0
393 { iadst8, idct8 }, // ADST_DCT = 1 427 { iadst8, idct8 }, // ADST_DCT = 1
394 { idct8, iadst8 }, // DCT_ADST = 2 428 { idct8, iadst8 }, // DCT_ADST = 2
395 { iadst8, iadst8 } // ADST_ADST = 3 429 { iadst8, iadst8 } // ADST_ADST = 3
396 }; 430 };
397 431
398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, 432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
399 int tx_type) { 433 int tx_type) {
400 int i, j; 434 int i, j;
401 int16_t out[8 * 8]; 435 tran_low_t out[8 * 8];
402 int16_t *outptr = out; 436 tran_low_t *outptr = out;
403 int16_t temp_in[8], temp_out[8]; 437 tran_low_t temp_in[8], temp_out[8];
404 const transform_2d ht = IHT_8[tx_type]; 438 const transform_2d ht = IHT_8[tx_type];
405 439
406 // inverse transform row vectors 440 // inverse transform row vectors
407 for (i = 0; i < 8; ++i) { 441 for (i = 0; i < 8; ++i) {
408 ht.rows(input, outptr); 442 ht.rows(input, outptr);
409 input += 8; 443 input += 8;
410 outptr += 8; 444 outptr += 8;
411 } 445 }
412 446
413 // inverse transform column vectors 447 // inverse transform column vectors
414 for (i = 0; i < 8; ++i) { 448 for (i = 0; i < 8; ++i) {
415 for (j = 0; j < 8; ++j) 449 for (j = 0; j < 8; ++j)
416 temp_in[j] = out[j * 8 + i]; 450 temp_in[j] = out[j * 8 + i];
417 ht.cols(temp_in, temp_out); 451 ht.cols(temp_in, temp_out);
418 for (j = 0; j < 8; ++j) 452 for (j = 0; j < 8; ++j)
419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 453 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
420 + dest[j * stride + i]); 454 + dest[j * stride + i]);
421 } 455 }
422 } 456 }
423 457
424 void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { 458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
425 int16_t out[8 * 8] = { 0 }; 459 tran_low_t out[8 * 8] = { 0 };
426 int16_t *outptr = out; 460 tran_low_t *outptr = out;
427 int i, j; 461 int i, j;
428 int16_t temp_in[8], temp_out[8]; 462 tran_low_t temp_in[8], temp_out[8];
429 463
430 // First transform rows 464 // First transform rows
431 // only the first 4 rows have non-zero coefs 465 // only the first 4 rows have non-zero coefs
432 for (i = 0; i < 4; ++i) { 466 for (i = 0; i < 4; ++i) {
433 idct8(input, outptr); 467 idct8(input, outptr);
434 input += 8; 468 input += 8;
435 outptr += 8; 469 outptr += 8;
436 } 470 }
437 471
438 // Then transform columns 472 // Then transform columns
439 for (i = 0; i < 8; ++i) { 473 for (i = 0; i < 8; ++i) {
440 for (j = 0; j < 8; ++j) 474 for (j = 0; j < 8; ++j)
441 temp_in[j] = out[j * 8 + i]; 475 temp_in[j] = out[j * 8 + i];
442 idct8(temp_in, temp_out); 476 idct8(temp_in, temp_out);
443 for (j = 0; j < 8; ++j) 477 for (j = 0; j < 8; ++j)
444 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 478 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
445 + dest[j * stride + i]); 479 + dest[j * stride + i]);
446 } 480 }
447 } 481 }
448 482
449 static void idct16(const int16_t *input, int16_t *output) { 483 static void idct16(const tran_low_t *input, tran_low_t *output) {
450 int16_t step1[16], step2[16]; 484 tran_low_t step1[16], step2[16];
451 int temp1, temp2; 485 tran_high_t temp1, temp2;
452 486
453 // stage 1 487 // stage 1
454 step1[0] = input[0/2]; 488 step1[0] = input[0/2];
455 step1[1] = input[16/2]; 489 step1[1] = input[16/2];
456 step1[2] = input[8/2]; 490 step1[2] = input[8/2];
457 step1[3] = input[24/2]; 491 step1[3] = input[24/2];
458 step1[4] = input[4/2]; 492 step1[4] = input[4/2];
459 step1[5] = input[20/2]; 493 step1[5] = input[20/2];
460 step1[6] = input[12/2]; 494 step1[6] = input[12/2];
461 step1[7] = input[28/2]; 495 step1[7] = input[28/2];
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after
604 output[8] = step2[7] - step2[8]; 638 output[8] = step2[7] - step2[8];
605 output[9] = step2[6] - step2[9]; 639 output[9] = step2[6] - step2[9];
606 output[10] = step2[5] - step2[10]; 640 output[10] = step2[5] - step2[10];
607 output[11] = step2[4] - step2[11]; 641 output[11] = step2[4] - step2[11];
608 output[12] = step2[3] - step2[12]; 642 output[12] = step2[3] - step2[12];
609 output[13] = step2[2] - step2[13]; 643 output[13] = step2[2] - step2[13];
610 output[14] = step2[1] - step2[14]; 644 output[14] = step2[1] - step2[14];
611 output[15] = step2[0] - step2[15]; 645 output[15] = step2[0] - step2[15];
612 } 646 }
613 647
614 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { 648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
615 int16_t out[16 * 16]; 649 int stride) {
616 int16_t *outptr = out; 650 tran_low_t out[16 * 16];
651 tran_low_t *outptr = out;
617 int i, j; 652 int i, j;
618 int16_t temp_in[16], temp_out[16]; 653 tran_low_t temp_in[16], temp_out[16];
619 654
620 // First transform rows 655 // First transform rows
621 for (i = 0; i < 16; ++i) { 656 for (i = 0; i < 16; ++i) {
622 idct16(input, outptr); 657 idct16(input, outptr);
623 input += 16; 658 input += 16;
624 outptr += 16; 659 outptr += 16;
625 } 660 }
626 661
627 // Then transform columns 662 // Then transform columns
628 for (i = 0; i < 16; ++i) { 663 for (i = 0; i < 16; ++i) {
629 for (j = 0; j < 16; ++j) 664 for (j = 0; j < 16; ++j)
630 temp_in[j] = out[j * 16 + i]; 665 temp_in[j] = out[j * 16 + i];
631 idct16(temp_in, temp_out); 666 idct16(temp_in, temp_out);
632 for (j = 0; j < 16; ++j) 667 for (j = 0; j < 16; ++j)
633 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 668 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
634 + dest[j * stride + i]); 669 + dest[j * stride + i]);
635 } 670 }
636 } 671 }
637 672
638 static void iadst16(const int16_t *input, int16_t *output) { 673 static void iadst16(const tran_low_t *input, tran_low_t *output) {
639 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 674 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
675 tran_high_t s9, s10, s11, s12, s13, s14, s15;
640 676
641 int x0 = input[15]; 677 tran_high_t x0 = input[15];
642 int x1 = input[0]; 678 tran_high_t x1 = input[0];
643 int x2 = input[13]; 679 tran_high_t x2 = input[13];
644 int x3 = input[2]; 680 tran_high_t x3 = input[2];
645 int x4 = input[11]; 681 tran_high_t x4 = input[11];
646 int x5 = input[4]; 682 tran_high_t x5 = input[4];
647 int x6 = input[9]; 683 tran_high_t x6 = input[9];
648 int x7 = input[6]; 684 tran_high_t x7 = input[6];
649 int x8 = input[7]; 685 tran_high_t x8 = input[7];
650 int x9 = input[8]; 686 tran_high_t x9 = input[8];
651 int x10 = input[5]; 687 tran_high_t x10 = input[5];
652 int x11 = input[10]; 688 tran_high_t x11 = input[10];
653 int x12 = input[3]; 689 tran_high_t x12 = input[3];
654 int x13 = input[12]; 690 tran_high_t x13 = input[12];
655 int x14 = input[1]; 691 tran_high_t x14 = input[1];
656 int x15 = input[14]; 692 tran_high_t x15 = input[14];
657 693
658 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 694 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
659 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { 695 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
660 output[0] = output[1] = output[2] = output[3] = output[4] 696 output[0] = output[1] = output[2] = output[3] = output[4]
661 = output[5] = output[6] = output[7] = output[8] 697 = output[5] = output[6] = output[7] = output[8]
662 = output[9] = output[10] = output[11] = output[12] 698 = output[9] = output[10] = output[11] = output[12]
663 = output[13] = output[14] = output[15] = 0; 699 = output[13] = output[14] = output[15] = 0;
664 return; 700 return;
665 } 701 }
666 702
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after
806 output[15] = -x1; 842 output[15] = -x1;
807 } 843 }
808 844
809 static const transform_2d IHT_16[] = { 845 static const transform_2d IHT_16[] = {
810 { idct16, idct16 }, // DCT_DCT = 0 846 { idct16, idct16 }, // DCT_DCT = 0
811 { iadst16, idct16 }, // ADST_DCT = 1 847 { iadst16, idct16 }, // ADST_DCT = 1
812 { idct16, iadst16 }, // DCT_ADST = 2 848 { idct16, iadst16 }, // DCT_ADST = 2
813 { iadst16, iadst16 } // ADST_ADST = 3 849 { iadst16, iadst16 } // ADST_ADST = 3
814 }; 850 };
815 851
816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, 852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
817 int tx_type) { 853 int tx_type) {
818 int i, j; 854 int i, j;
819 int16_t out[16 * 16]; 855 tran_low_t out[16 * 16];
820 int16_t *outptr = out; 856 tran_low_t *outptr = out;
821 int16_t temp_in[16], temp_out[16]; 857 tran_low_t temp_in[16], temp_out[16];
822 const transform_2d ht = IHT_16[tx_type]; 858 const transform_2d ht = IHT_16[tx_type];
823 859
824 // Rows 860 // Rows
825 for (i = 0; i < 16; ++i) { 861 for (i = 0; i < 16; ++i) {
826 ht.rows(input, outptr); 862 ht.rows(input, outptr);
827 input += 16; 863 input += 16;
828 outptr += 16; 864 outptr += 16;
829 } 865 }
830 866
831 // Columns 867 // Columns
832 for (i = 0; i < 16; ++i) { 868 for (i = 0; i < 16; ++i) {
833 for (j = 0; j < 16; ++j) 869 for (j = 0; j < 16; ++j)
834 temp_in[j] = out[j * 16 + i]; 870 temp_in[j] = out[j * 16 + i];
835 ht.cols(temp_in, temp_out); 871 ht.cols(temp_in, temp_out);
836 for (j = 0; j < 16; ++j) 872 for (j = 0; j < 16; ++j)
837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 873 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
838 + dest[j * stride + i]); 874 + dest[j * stride + i]);
839 } 875 }
840 } 876 }
841 877
842 void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { 878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
843 int16_t out[16 * 16] = { 0 }; 879 int stride) {
844 int16_t *outptr = out; 880 tran_low_t out[16 * 16] = { 0 };
881 tran_low_t *outptr = out;
845 int i, j; 882 int i, j;
846 int16_t temp_in[16], temp_out[16]; 883 tran_low_t temp_in[16], temp_out[16];
847 884
848 // First transform rows. Since all non-zero dct coefficients are in 885 // First transform rows. Since all non-zero dct coefficients are in
849 // upper-left 4x4 area, we only need to calculate first 4 rows here. 886 // upper-left 4x4 area, we only need to calculate first 4 rows here.
850 for (i = 0; i < 4; ++i) { 887 for (i = 0; i < 4; ++i) {
851 idct16(input, outptr); 888 idct16(input, outptr);
852 input += 16; 889 input += 16;
853 outptr += 16; 890 outptr += 16;
854 } 891 }
855 892
856 // Then transform columns 893 // Then transform columns
857 for (i = 0; i < 16; ++i) { 894 for (i = 0; i < 16; ++i) {
858 for (j = 0; j < 16; ++j) 895 for (j = 0; j < 16; ++j)
859 temp_in[j] = out[j*16 + i]; 896 temp_in[j] = out[j*16 + i];
860 idct16(temp_in, temp_out); 897 idct16(temp_in, temp_out);
861 for (j = 0; j < 16; ++j) 898 for (j = 0; j < 16; ++j)
862 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 899 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
863 + dest[j * stride + i]); 900 + dest[j * stride + i]);
864 } 901 }
865 } 902 }
866 903
867 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { 904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
868 int i, j; 905 int i, j;
869 int a1; 906 tran_high_t a1;
870 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 907 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
871 out = dct_const_round_shift(out * cospi_16_64); 908 out = dct_const_round_shift(out * cospi_16_64);
872 a1 = ROUND_POWER_OF_TWO(out, 6); 909 a1 = ROUND_POWER_OF_TWO(out, 6);
873 for (j = 0; j < 16; ++j) { 910 for (j = 0; j < 16; ++j) {
874 for (i = 0; i < 16; ++i) 911 for (i = 0; i < 16; ++i)
875 dest[i] = clip_pixel(dest[i] + a1); 912 dest[i] = clip_pixel(dest[i] + a1);
876 dest += stride; 913 dest += stride;
877 } 914 }
878 } 915 }
879 916
880 static void idct32(const int16_t *input, int16_t *output) { 917 static void idct32(const tran_low_t *input, tran_low_t *output) {
881 int16_t step1[32], step2[32]; 918 tran_low_t step1[32], step2[32];
882 int temp1, temp2; 919 tran_high_t temp1, temp2;
883 920
884 // stage 1 921 // stage 1
885 step1[0] = input[0]; 922 step1[0] = input[0];
886 step1[1] = input[16]; 923 step1[1] = input[16];
887 step1[2] = input[8]; 924 step1[2] = input[8];
888 step1[3] = input[24]; 925 step1[3] = input[24];
889 step1[4] = input[4]; 926 step1[4] = input[4];
890 step1[5] = input[20]; 927 step1[5] = input[20];
891 step1[6] = input[12]; 928 step1[6] = input[12];
892 step1[7] = input[28]; 929 step1[7] = input[28];
(...skipping 344 matching lines...) Expand 10 before | Expand all | Expand 10 after
1237 output[24] = step1[7] - step1[24]; 1274 output[24] = step1[7] - step1[24];
1238 output[25] = step1[6] - step1[25]; 1275 output[25] = step1[6] - step1[25];
1239 output[26] = step1[5] - step1[26]; 1276 output[26] = step1[5] - step1[26];
1240 output[27] = step1[4] - step1[27]; 1277 output[27] = step1[4] - step1[27];
1241 output[28] = step1[3] - step1[28]; 1278 output[28] = step1[3] - step1[28];
1242 output[29] = step1[2] - step1[29]; 1279 output[29] = step1[2] - step1[29];
1243 output[30] = step1[1] - step1[30]; 1280 output[30] = step1[1] - step1[30];
1244 output[31] = step1[0] - step1[31]; 1281 output[31] = step1[0] - step1[31];
1245 } 1282 }
1246 1283
1247 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { 1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1248 int16_t out[32 * 32]; 1285 int stride) {
1249 int16_t *outptr = out; 1286 tran_low_t out[32 * 32];
1287 tran_low_t *outptr = out;
1250 int i, j; 1288 int i, j;
1251 int16_t temp_in[32], temp_out[32]; 1289 tran_low_t temp_in[32], temp_out[32];
1252 1290
1253 // Rows 1291 // Rows
1254 for (i = 0; i < 32; ++i) { 1292 for (i = 0; i < 32; ++i) {
1255 int16_t zero_coeff[16]; 1293 int16_t zero_coeff[16];
1256 for (j = 0; j < 16; ++j) 1294 for (j = 0; j < 16; ++j)
1257 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 1295 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1258 for (j = 0; j < 8; ++j) 1296 for (j = 0; j < 8; ++j)
1259 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1297 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1260 for (j = 0; j < 4; ++j) 1298 for (j = 0; j < 4; ++j)
1261 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1299 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1262 for (j = 0; j < 2; ++j) 1300 for (j = 0; j < 2; ++j)
1263 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1301 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1264 1302
1265 if (zero_coeff[0] | zero_coeff[1]) 1303 if (zero_coeff[0] | zero_coeff[1])
1266 idct32(input, outptr); 1304 idct32(input, outptr);
1267 else 1305 else
1268 vpx_memset(outptr, 0, sizeof(int16_t) * 32); 1306 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
1269 input += 32; 1307 input += 32;
1270 outptr += 32; 1308 outptr += 32;
1271 } 1309 }
1272 1310
1273 // Columns 1311 // Columns
1274 for (i = 0; i < 32; ++i) { 1312 for (i = 0; i < 32; ++i) {
1275 for (j = 0; j < 32; ++j) 1313 for (j = 0; j < 32; ++j)
1276 temp_in[j] = out[j * 32 + i]; 1314 temp_in[j] = out[j * 32 + i];
1277 idct32(temp_in, temp_out); 1315 idct32(temp_in, temp_out);
1278 for (j = 0; j < 32; ++j) 1316 for (j = 0; j < 32; ++j)
1279 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1317 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1280 + dest[j * stride + i]); 1318 + dest[j * stride + i]);
1281 } 1319 }
1282 } 1320 }
1283 1321
1284 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { 1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1285 int16_t out[32 * 32] = {0}; 1323 int stride) {
1286 int16_t *outptr = out; 1324 tran_low_t out[32 * 32] = {0};
1325 tran_low_t *outptr = out;
1287 int i, j; 1326 int i, j;
1288 int16_t temp_in[32], temp_out[32]; 1327 tran_low_t temp_in[32], temp_out[32];
1289 1328
1290 // Rows 1329 // Rows
1291 // only upper-left 8x8 has non-zero coeff 1330 // only upper-left 8x8 has non-zero coeff
1292 for (i = 0; i < 8; ++i) { 1331 for (i = 0; i < 8; ++i) {
1293 idct32(input, outptr); 1332 idct32(input, outptr);
1294 input += 32; 1333 input += 32;
1295 outptr += 32; 1334 outptr += 32;
1296 } 1335 }
1297 1336
1298 // Columns 1337 // Columns
1299 for (i = 0; i < 32; ++i) { 1338 for (i = 0; i < 32; ++i) {
1300 for (j = 0; j < 32; ++j) 1339 for (j = 0; j < 32; ++j)
1301 temp_in[j] = out[j * 32 + i]; 1340 temp_in[j] = out[j * 32 + i];
1302 idct32(temp_in, temp_out); 1341 idct32(temp_in, temp_out);
1303 for (j = 0; j < 32; ++j) 1342 for (j = 0; j < 32; ++j)
1304 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1343 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1305 + dest[j * stride + i]); 1344 + dest[j * stride + i]);
1306 } 1345 }
1307 } 1346 }
1308 1347
1309 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { 1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1310 int i, j; 1349 int i, j;
1311 int a1; 1350 tran_high_t a1;
1312 1351
1313 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 1352 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
1314 out = dct_const_round_shift(out * cospi_16_64); 1353 out = dct_const_round_shift(out * cospi_16_64);
1315 a1 = ROUND_POWER_OF_TWO(out, 6); 1354 a1 = ROUND_POWER_OF_TWO(out, 6);
1316 1355
1317 for (j = 0; j < 32; ++j) { 1356 for (j = 0; j < 32; ++j) {
1318 for (i = 0; i < 32; ++i) 1357 for (i = 0; i < 32; ++i)
1319 dest[i] = clip_pixel(dest[i] + a1); 1358 dest[i] = clip_pixel(dest[i] + a1);
1320 dest += stride; 1359 dest += stride;
1321 } 1360 }
1322 } 1361 }
1323 1362
1324 // idct 1363 // idct
1325 void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { 1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
1365 int eob) {
1326 if (eob > 1) 1366 if (eob > 1)
1327 vp9_idct4x4_16_add(input, dest, stride); 1367 vp9_idct4x4_16_add(input, dest, stride);
1328 else 1368 else
1329 vp9_idct4x4_1_add(input, dest, stride); 1369 vp9_idct4x4_1_add(input, dest, stride);
1330 } 1370 }
1331 1371
1332 1372
1333 void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { 1373 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
1374 int eob) {
1334 if (eob > 1) 1375 if (eob > 1)
1335 vp9_iwht4x4_16_add(input, dest, stride); 1376 vp9_iwht4x4_16_add(input, dest, stride);
1336 else 1377 else
1337 vp9_iwht4x4_1_add(input, dest, stride); 1378 vp9_iwht4x4_1_add(input, dest, stride);
1338 } 1379 }
1339 1380
1340 void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { 1381 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
1382 int eob) {
1341 // If dc is 1, then input[0] is the reconstructed value, do not need 1383 // If dc is 1, then input[0] is the reconstructed value, do not need
1342 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. 1384 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
1343 1385
1344 // The calculation can be simplified if there are not many non-zero dct 1386 // The calculation can be simplified if there are not many non-zero dct
1345 // coefficients. Use eobs to decide what to do. 1387 // coefficients. Use eobs to decide what to do.
1346 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. 1388 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
1347 // Combine that with code here. 1389 // Combine that with code here.
1348 if (eob == 1) 1390 if (eob == 1)
1349 // DC only DCT coefficient 1391 // DC only DCT coefficient
1350 vp9_idct8x8_1_add(input, dest, stride); 1392 vp9_idct8x8_1_add(input, dest, stride);
1351 else if (eob <= 12) 1393 else if (eob <= 12)
1352 vp9_idct8x8_12_add(input, dest, stride); 1394 vp9_idct8x8_12_add(input, dest, stride);
1353 else 1395 else
1354 vp9_idct8x8_64_add(input, dest, stride); 1396 vp9_idct8x8_64_add(input, dest, stride);
1355 } 1397 }
1356 1398
1357 void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, 1399 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
1358 int eob) { 1400 int eob) {
1359 /* The calculation can be simplified if there are not many non-zero dct 1401 /* The calculation can be simplified if there are not many non-zero dct
1360 * coefficients. Use eobs to separate different cases. */ 1402 * coefficients. Use eobs to separate different cases. */
1361 if (eob == 1) 1403 if (eob == 1)
1362 /* DC only DCT coefficient. */ 1404 /* DC only DCT coefficient. */
1363 vp9_idct16x16_1_add(input, dest, stride); 1405 vp9_idct16x16_1_add(input, dest, stride);
1364 else if (eob <= 10) 1406 else if (eob <= 10)
1365 vp9_idct16x16_10_add(input, dest, stride); 1407 vp9_idct16x16_10_add(input, dest, stride);
1366 else 1408 else
1367 vp9_idct16x16_256_add(input, dest, stride); 1409 vp9_idct16x16_256_add(input, dest, stride);
1368 } 1410 }
1369 1411
1370 void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, 1412 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
1371 int eob) { 1413 int eob) {
1372 if (eob == 1) 1414 if (eob == 1)
1373 vp9_idct32x32_1_add(input, dest, stride); 1415 vp9_idct32x32_1_add(input, dest, stride);
1374 else if (eob <= 34) 1416 else if (eob <= 34)
1375 // non-zero coeff only in upper-left 8x8 1417 // non-zero coeff only in upper-left 8x8
1376 vp9_idct32x32_34_add(input, dest, stride); 1418 vp9_idct32x32_34_add(input, dest, stride);
1377 else 1419 else
1378 vp9_idct32x32_1024_add(input, dest, stride); 1420 vp9_idct32x32_1024_add(input, dest, stride);
1379 } 1421 }
1380 1422
1381 // iht 1423 // iht
1382 void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, 1424 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1383 int stride, int eob) { 1425 int stride, int eob) {
1384 if (tx_type == DCT_DCT) 1426 if (tx_type == DCT_DCT)
1385 vp9_idct4x4_add(input, dest, stride, eob); 1427 vp9_idct4x4_add(input, dest, stride, eob);
1386 else 1428 else
1387 vp9_iht4x4_16_add(input, dest, stride, tx_type); 1429 vp9_iht4x4_16_add(input, dest, stride, tx_type);
1388 } 1430 }
1389 1431
1390 void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, 1432 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1391 int stride, int eob) { 1433 int stride, int eob) {
1392 if (tx_type == DCT_DCT) { 1434 if (tx_type == DCT_DCT) {
1393 vp9_idct8x8_add(input, dest, stride, eob); 1435 vp9_idct8x8_add(input, dest, stride, eob);
1394 } else { 1436 } else {
1395 vp9_iht8x8_64_add(input, dest, stride, tx_type); 1437 vp9_iht8x8_64_add(input, dest, stride, tx_type);
1396 } 1438 }
1397 } 1439 }
1398 1440
1399 void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, 1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1400 int stride, int eob) { 1442 int stride, int eob) {
1401 if (tx_type == DCT_DCT) { 1443 if (tx_type == DCT_DCT) {
1402 vp9_idct16x16_add(input, dest, stride, eob); 1444 vp9_idct16x16_add(input, dest, stride, eob);
1403 } else { 1445 } else {
1404 vp9_iht16x16_256_add(input, dest, stride, tx_type); 1446 vp9_iht16x16_256_add(input, dest, stride, tx_type);
1405 } 1447 }
1406 } 1448 }
1449
1450 #if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth 4x4 inverse Walsh-Hadamard transform (lossless mode).
// Applies the row pass, then the column pass, and adds the result into the
// 16-bit destination clamped to 'bd' bits.
void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                               int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  // In high-bitdepth builds dest8 actually carries a uint16_t buffer.
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: undo the UNIT_QUANT scaling, then run the WHT butterfly.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same butterfly down each column, then reconstruct.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}
1504
// 4-point inverse DCT butterfly (high-bitdepth path), used for both row and
// column passes.  'bd' is unused here: the C reference relies on WRAPLOW
// for intermediate wrapping rather than bit-depth-specific clamping.
static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;
  // stage 1: even butterfly on inputs 0/2, odd rotation on inputs 1/3.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: combine even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
1525
// High-bitdepth DC-only 4x4 inverse Walsh-Hadamard transform: expands the
// single DC coefficient into a 4x4 residual and adds it into the 16-bit
// destination with clamping to 'bd' bits.
void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                              int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;

  // Row pass collapses to splitting the scaled DC between the first
  // output (a1) and the remaining three (e1).
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  // Column pass: same split per column, then clipped reconstruction.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
1554
1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1556 int stride, int bd) {
1557 tran_low_t out[4 * 4];
1558 tran_low_t *outptr = out;
1559 int i, j;
1560 tran_low_t temp_in[4], temp_out[4];
1561 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1562
1563 // Rows
1564 for (i = 0; i < 4; ++i) {
1565 high_idct4(input, outptr, bd);
1566 input += 4;
1567 outptr += 4;
1568 }
1569
1570 // Columns
1571 for (i = 0; i < 4; ++i) {
1572 for (j = 0; j < 4; ++j)
1573 temp_in[j] = out[j * 4 + i];
1574 high_idct4(temp_in, temp_out, bd);
1575 for (j = 0; j < 4; ++j)
1576 dest[j * stride + i] = clip_pixel_bd_high(
1577 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1578 }
1579 }
1580
1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1582 int dest_stride, int bd) {
1583 int i;
1584 tran_high_t a1;
1585 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1586 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1587
1588 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1589 a1 = ROUND_POWER_OF_TWO(out, 4);
1590
1591 for (i = 0; i < 4; i++) {
1592 dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
1593 dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
1594 dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
1595 dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
1596 dest += dest_stride;
1597 }
1598 }
1599
// 8-point inverse DCT butterfly (high-bitdepth path).  The even half
// reuses high_idct4 on the even-indexed inputs; the odd half is computed
// with its own rotations.
static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: gather even inputs for the 4-point sub-transform and rotate
  // the odd inputs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2 & stage 3 - even half (in-place 4-point inverse DCT).
  high_idct4(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
1644
1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1646 int stride, int bd) {
1647 tran_low_t out[8 * 8];
1648 tran_low_t *outptr = out;
1649 int i, j;
1650 tran_low_t temp_in[8], temp_out[8];
1651 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1652
1653 // First transform rows.
1654 for (i = 0; i < 8; ++i) {
1655 high_idct8(input, outptr, bd);
1656 input += 8;
1657 outptr += 8;
1658 }
1659
1660 // Then transform columns.
1661 for (i = 0; i < 8; ++i) {
1662 for (j = 0; j < 8; ++j)
1663 temp_in[j] = out[j * 8 + i];
1664 high_idct8(temp_in, temp_out, bd);
1665 for (j = 0; j < 8; ++j)
1666 dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i],
1667 ROUND_POWER_OF_TWO(temp_out[j], 5),
1668 bd);
1669 }
1670 }
1671
1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1673 int stride, int bd) {
1674 int i, j;
1675 tran_high_t a1;
1676 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1677 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1678 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1679 a1 = ROUND_POWER_OF_TWO(out, 5);
1680 for (j = 0; j < 8; ++j) {
1681 for (i = 0; i < 8; ++i)
1682 dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
1683 dest += stride;
1684 }
1685 }
1686
// 4-point inverse ADST (high-bitdepth path).  Short-circuits to all-zero
// output when every input coefficient is zero.
static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
  (void) bd;

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    vpx_memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0));
  output[1] = WRAPLOW(dct_const_round_shift(s1));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s3));
}
1729
// High-bitdepth 4x4 inverse hybrid transform: selects the DCT/ADST pair
// for rows and columns from tx_type, then reconstructs into the 16-bit
// destination with clamping to 'bd' bits (final shift of 4).
void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                              int stride, int tx_type, int bd) {
  // Row/column 1-D transform pairs indexed by TX_TYPE.
  const high_transform_2d IHT_4[] = {
    { high_idct4, high_idct4 },   // DCT_DCT  = 0
    { high_iadst4, high_idct4 },  // ADST_DCT = 1
    { high_idct4, high_iadst4 },  // DCT_ADST = 2
    { high_iadst4, high_iadst4 }  // ADST_ADST = 3
  };
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  int i, j;
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  tran_low_t temp_in[4], temp_out[4];

  // Inverse transform row vectors.
  for (i = 0; i < 4; ++i) {
    IHT_4[tx_type].rows(input, outptr, bd);
    input += 4;
    outptr += 4;
  }

  // Inverse transform column vectors, then clipped reconstruction.
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    IHT_4[tx_type].cols(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j)
      dest[j * stride + i] = clip_pixel_bd_high(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
  }
}
1762
// 8-point inverse ADST (high-bitdepth path).  Inputs are consumed in the
// ADST's permuted order; an all-zero input short-circuits to zero output.
static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
  (void) bd;

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    vpx_memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 = cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output with the ADST's alternating sign pattern.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
1839
// Row/column 1-D transform pairs for the 8x8 hybrid transform, indexed by
// TX_TYPE.
static const high_transform_2d HIGH_IHT_8[] = {
  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
  { high_iadst8, high_idct8  },  // ADST_DCT = 1
  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
};
1846
1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1848 int stride, int tx_type, int bd) {
1849 int i, j;
1850 tran_low_t out[8 * 8];
1851 tran_low_t *outptr = out;
1852 tran_low_t temp_in[8], temp_out[8];
1853 const high_transform_2d ht = HIGH_IHT_8[tx_type];
1854 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1855
1856 // Inverse transform row vectors.
1857 for (i = 0; i < 8; ++i) {
1858 ht.rows(input, outptr, bd);
1859 input += 8;
1860 outptr += 8;
1861 }
1862
1863 // Inverse transform column vectors.
1864 for (i = 0; i < 8; ++i) {
1865 for (j = 0; j < 8; ++j)
1866 temp_in[j] = out[j * 8 + i];
1867 ht.cols(temp_in, temp_out, bd);
1868 for (j = 0; j < 8; ++j)
1869 dest[j * stride + i] = clip_pixel_bd_high(
1870 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1871 }
1872 }
1873
1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
1875 int stride, int bd) {
1876 tran_low_t out[8 * 8] = { 0 };
1877 tran_low_t *outptr = out;
1878 int i, j;
1879 tran_low_t temp_in[8], temp_out[8];
1880 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1881
1882 // First transform rows.
1883 // Only first 4 row has non-zero coefs.
1884 for (i = 0; i < 4; ++i) {
1885 high_idct8(input, outptr, bd);
1886 input += 8;
1887 outptr += 8;
1888 }
1889 // Then transform columns.
1890 for (i = 0; i < 8; ++i) {
1891 for (j = 0; j < 8; ++j)
1892 temp_in[j] = out[j * 8 + i];
1893 high_idct8(temp_in, temp_out, bd);
1894 for (j = 0; j < 8; ++j)
1895 dest[j * stride + i] = clip_pixel_bd_high(
1896 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1897 }
1898 }
1899
// 16-point inverse DCT butterfly (high-bitdepth path).  Stage 1 reorders
// the inputs (even frequencies first, written as input[k/2] to mirror the
// 32-point layout); subsequent stages are the standard butterfly network.
static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: pass the even half through, rotate the odd half.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly combining the two halves.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
2065
2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2067 int stride, int bd) {
2068 tran_low_t out[16 * 16];
2069 tran_low_t *outptr = out;
2070 int i, j;
2071 tran_low_t temp_in[16], temp_out[16];
2072 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2073
2074 // First transform rows.
2075 for (i = 0; i < 16; ++i) {
2076 high_idct16(input, outptr, bd);
2077 input += 16;
2078 outptr += 16;
2079 }
2080
2081 // Then transform columns.
2082 for (i = 0; i < 16; ++i) {
2083 for (j = 0; j < 16; ++j)
2084 temp_in[j] = out[j * 16 + i];
2085 high_idct16(temp_in, temp_out, bd);
2086 for (j = 0; j < 16; ++j)
2087 dest[j * stride + i] = clip_pixel_bd_high(
2088 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2089 }
2090 }
2091
// 16-point inverse ADST (asymmetric DST), high-bitdepth C reference.
// |input| and |output| are 16-entry arrays holding one row or column.
// |bd| is unused: intermediates are widened to tran_high_t and WRAPLOW
// narrows results back to the tran_low_t range.
// NOTE(review): statement order and the exact WRAPLOW/round placement
// define the bit-exact decoder output — do not reorder.
static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Inputs are consumed in the ADST's permuted order (15, 0, 13, 2, ...).
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
  (void) bd;

  // Early out: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    vpx_memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Lower half has no rotation, so no rounding shift is applied here.
  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final +/-45-degree rotations by cospi_16_64.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Output in the ADST's permuted order with alternating sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
2261
// 1-D transform pairs for the 16x16 hybrid transform, indexed by tx_type.
// .rows is applied in the row pass, .cols in the column pass.
static const high_transform_2d HIGH_IHT_16[] = {
  { high_idct16, high_idct16 },  // DCT_DCT = 0
  { high_iadst16, high_idct16 },  // ADST_DCT = 1
  { high_idct16, high_iadst16 },  // DCT_ADST = 2
  { high_iadst16, high_iadst16 }  // ADST_ADST = 3
};
2268
2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2270 int stride, int tx_type, int bd) {
2271 int i, j;
2272 tran_low_t out[16 * 16];
2273 tran_low_t *outptr = out;
2274 tran_low_t temp_in[16], temp_out[16];
2275 const high_transform_2d ht = HIGH_IHT_16[tx_type];
2276 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2277
2278 // Rows
2279 for (i = 0; i < 16; ++i) {
2280 ht.rows(input, outptr, bd);
2281 input += 16;
2282 outptr += 16;
2283 }
2284
2285 // Columns
2286 for (i = 0; i < 16; ++i) {
2287 for (j = 0; j < 16; ++j)
2288 temp_in[j] = out[j * 16 + i];
2289 ht.cols(temp_in, temp_out, bd);
2290 for (j = 0; j < 16; ++j)
2291 dest[j * stride + i] = clip_pixel_bd_high(
2292 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2293 }
2294 }
2295
2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2297 int stride, int bd) {
2298 tran_low_t out[16 * 16] = { 0 };
2299 tran_low_t *outptr = out;
2300 int i, j;
2301 tran_low_t temp_in[16], temp_out[16];
2302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2303
2304 // First transform rows. Since all non-zero dct coefficients are in
2305 // upper-left 4x4 area, we only need to calculate first 4 rows here.
2306 for (i = 0; i < 4; ++i) {
2307 high_idct16(input, outptr, bd);
2308 input += 16;
2309 outptr += 16;
2310 }
2311
2312 // Then transform columns.
2313 for (i = 0; i < 16; ++i) {
2314 for (j = 0; j < 16; ++j)
2315 temp_in[j] = out[j*16 + i];
2316 high_idct16(temp_in, temp_out, bd);
2317 for (j = 0; j < 16; ++j)
2318 dest[j * stride + i] = clip_pixel_bd_high(
2319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2320 }
2321 }
2322
2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2324 int stride, int bd) {
2325 int i, j;
2326 tran_high_t a1;
2327 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
2328 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2329
2330 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
2331 a1 = ROUND_POWER_OF_TWO(out, 6);
2332 for (j = 0; j < 16; ++j) {
2333 for (i = 0; i < 16; ++i)
2334 dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
2335 dest += stride;
2336 }
2337 }
2338
// 32-point 1-D inverse DCT butterfly, high-bitdepth C reference.
// |input| / |output| are 32-entry arrays holding one row or column.
// |bd| is unused: intermediates are widened to tran_high_t and WRAPLOW
// narrows results back to the tran_low_t range.
// NOTE(review): the stage structure and WRAPLOW/round placement define
// the bit-exact decoder output — do not reorder statements.
static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1: bit-reversed reorder of the even inputs (the embedded
  // 16-point transform), then first rotations of the odd inputs.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: rotations for entries 8..15, butterflies for 16..31.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  // NOTE(review): WRAPLOW on a plain copy is a no-op (the 16-point
  // version uses a bare assignment here); kept for bit-exactness.
  step2[14] = WRAPLOW(step1[14]);
  step2[15] = WRAPLOW(step1[15]);

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage: mirror butterfly producing the 32 outputs.
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
2706
2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2708 int stride, int bd) {
2709 tran_low_t out[32 * 32];
2710 tran_low_t *outptr = out;
2711 int i, j;
2712 tran_low_t temp_in[32], temp_out[32];
2713 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2714
2715 // Rows
2716 for (i = 0; i < 32; ++i) {
2717 tran_low_t zero_coeff[16];
2718 for (j = 0; j < 16; ++j)
2719 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2720 for (j = 0; j < 8; ++j)
2721 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2722 for (j = 0; j < 4; ++j)
2723 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2724 for (j = 0; j < 2; ++j)
2725 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2726
2727 if (zero_coeff[0] | zero_coeff[1])
2728 high_idct32(input, outptr, bd);
2729 else
2730 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
2731 input += 32;
2732 outptr += 32;
2733 }
2734
2735 // Columns
2736 for (i = 0; i < 32; ++i) {
2737 for (j = 0; j < 32; ++j)
2738 temp_in[j] = out[j * 32 + i];
2739 high_idct32(temp_in, temp_out, bd);
2740 for (j = 0; j < 32; ++j)
2741 dest[j * stride + i] = clip_pixel_bd_high(
2742 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2743 }
2744 }
2745
2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2747 int stride, int bd) {
2748 tran_low_t out[32 * 32] = {0};
2749 tran_low_t *outptr = out;
2750 int i, j;
2751 tran_low_t temp_in[32], temp_out[32];
2752 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2753
2754 // Rows
2755 // Only upper-left 8x8 has non-zero coeff.
2756 for (i = 0; i < 8; ++i) {
2757 high_idct32(input, outptr, bd);
2758 input += 32;
2759 outptr += 32;
2760 }
2761 // Columns
2762 for (i = 0; i < 32; ++i) {
2763 for (j = 0; j < 32; ++j)
2764 temp_in[j] = out[j * 32 + i];
2765 high_idct32(temp_in, temp_out, bd);
2766 for (j = 0; j < 32; ++j)
2767 dest[j * stride + i] = clip_pixel_bd_high(
2768 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2769 }
2770 }
2771
2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2773 int stride, int bd) {
2774 int i, j;
2775 int a1;
2776 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2777
2778 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
2779 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
2780 a1 = ROUND_POWER_OF_TWO(out, 6);
2781
2782 for (j = 0; j < 32; ++j) {
2783 for (i = 0; i < 32; ++i)
2784 dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
2785 dest += stride;
2786 }
2787 }
2788
2789 // idct
2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2791 int eob, int bd) {
2792 if (eob > 1)
2793 vp9_high_idct4x4_16_add(input, dest, stride, bd);
2794 else
2795 vp9_high_idct4x4_1_add(input, dest, stride, bd);
2796 }
2797
2798
2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2800 int eob, int bd) {
2801 if (eob > 1)
2802 vp9_high_iwht4x4_16_add(input, dest, stride, bd);
2803 else
2804 vp9_high_iwht4x4_1_add(input, dest, stride, bd);
2805 }
2806
2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
2808 int eob, int bd) {
2809 // If dc is 1, then input[0] is the reconstructed value, do not need
2810 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
2811
2812 // The calculation can be simplified if there are not many non-zero dct
2813 // coefficients. Use eobs to decide what to do.
2814 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
2815 // Combine that with code here.
2816 // DC only DCT coefficient
2817 if (eob == 1) {
2818 vp9_high_idct8x8_1_add(input, dest, stride, bd);
2819 } else if (eob <= 10) {
2820 vp9_high_idct8x8_10_add(input, dest, stride, bd);
2821 } else {
2822 vp9_high_idct8x8_64_add(input, dest, stride, bd);
2823 }
2824 }
2825
2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
2827 int eob, int bd) {
2828 // The calculation can be simplified if there are not many non-zero dct
2829 // coefficients. Use eobs to separate different cases.
2830 // DC only DCT coefficient.
2831 if (eob == 1) {
2832 vp9_high_idct16x16_1_add(input, dest, stride, bd);
2833 } else if (eob <= 10) {
2834 vp9_high_idct16x16_10_add(input, dest, stride, bd);
2835 } else {
2836 vp9_high_idct16x16_256_add(input, dest, stride, bd);
2837 }
2838 }
2839
2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
2841 int eob, int bd) {
2842 // Non-zero coeff only in upper-left 8x8
2843 if (eob == 1) {
2844 vp9_high_idct32x32_1_add(input, dest, stride, bd);
2845 } else if (eob <= 34) {
2846 vp9_high_idct32x32_34_add(input, dest, stride, bd);
2847 } else {
2848 vp9_high_idct32x32_1024_add(input, dest, stride, bd);
2849 }
2850 }
2851
2852 // iht
2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
2854 uint8_t *dest, int stride, int eob, int bd) {
2855 if (tx_type == DCT_DCT)
2856 vp9_high_idct4x4_add(input, dest, stride, eob, bd);
2857 else
2858 vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
2859 }
2860
2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
2862 uint8_t *dest, int stride, int eob, int bd) {
2863 if (tx_type == DCT_DCT) {
2864 vp9_high_idct8x8_add(input, dest, stride, eob, bd);
2865 } else {
2866 vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
2867 }
2868 }
2869
2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
2871 uint8_t *dest, int stride, int eob, int bd) {
2872 if (tx_type == DCT_DCT) {
2873 vp9_high_idct16x16_add(input, dest, stride, eob, bd);
2874 } else {
2875 vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
2876 }
2877 }
2878 #endif // CONFIG_VP9_HIGHBITDEPTH
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_idct.h ('k') | source/libvpx/vp9/common/vp9_loopfilter.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698