Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(224)

Side by Side Diff: source/libvpx/vp9/common/vp9_idct.c

Issue 668403002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/common/vp9_idct.h ('k') | source/libvpx/vp9/common/vp9_loopfilter.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h>
12 #include <math.h> 11 #include <math.h>
13 12
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h" 13 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_systemdependent.h" 14 #include "vp9/common/vp9_systemdependent.h"
17 #include "vp9/common/vp9_blockd.h" 15 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_idct.h" 16 #include "vp9/common/vp9_idct.h"
20 17
21 #if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 18 #if CONFIG_EMULATE_HARDWARE
 22 // When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs strict 19 // When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
23 // overflow wrapping to match expected hardware implementations. 20 // non-normative method to handle overflows. A stream that causes
21 // overflows in the inverse transform is considered invalid in VP9,
22 // and a hardware implementer is free to choose any reasonable
23 // method to handle overflows. However to aid in hardware
24 // verification they can use a specific implementation of the
25 // WRAPLOW() macro below that is identical to their intended
26 // hardware implementation (and also use configure options to trigger
27 // the C-implementation of the transform).
28 //
29 // The particular WRAPLOW implementation below performs strict
30 // overflow wrapping to match common hardware implementations.
24 // bd of 8 uses trans_low with 16bits, need to remove 16bits 31 // bd of 8 uses trans_low with 16bits, need to remove 16bits
25 // bd of 10 uses trans_low with 18bits, need to remove 14bits 32 // bd of 10 uses trans_low with 18bits, need to remove 14bits
26 // bd of 12 uses trans_low with 20bits, need to remove 12bits 33 // bd of 12 uses trans_low with 20bits, need to remove 12bits
27 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits 34 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
28 #define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd)) 35 #define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
29 #else 36 #else
30 #define WRAPLOW(x) (x) 37 #define WRAPLOW(x, bd) (x)
31 #endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 38 #endif // CONFIG_EMULATE_HARDWARE
32 39
33 #if CONFIG_VP9_HIGHBITDEPTH 40 #if CONFIG_VP9_HIGHBITDEPTH
34 static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low, 41 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
35 tran_low_t high) { 42 int bd) {
36 return value < low ? low : (value > high ? high : value); 43 trans = WRAPLOW(trans, bd);
37 } 44 return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
38
39 static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
40 tran_high_t trans, int bd) {
41 trans = WRAPLOW(trans);
42 switch (bd) {
43 case 8:
44 default:
45 return clamp_high(WRAPLOW(dest + trans), 0, 255);
46 case 10:
47 return clamp_high(WRAPLOW(dest + trans), 0, 1023);
48 case 12:
49 return clamp_high(WRAPLOW(dest + trans), 0, 4095);
50 }
51 } 45 }
52 #endif // CONFIG_VP9_HIGHBITDEPTH 46 #endif // CONFIG_VP9_HIGHBITDEPTH
53 47
48 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
49 trans = WRAPLOW(trans, 8);
50 return clip_pixel(WRAPLOW(dest + trans, 8));
51 }
52
54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 53 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 54 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
56 0.5 shifts per pixel. */ 55 0.5 shifts per pixel. */
57 int i; 56 int i;
58 tran_low_t output[16]; 57 tran_low_t output[16];
59 tran_high_t a1, b1, c1, d1, e1; 58 tran_high_t a1, b1, c1, d1, e1;
60 const tran_low_t *ip = input; 59 const tran_low_t *ip = input;
61 tran_low_t *op = output; 60 tran_low_t *op = output;
62 61
63 for (i = 0; i < 4; i++) { 62 for (i = 0; i < 4; i++) {
64 a1 = ip[0] >> UNIT_QUANT_SHIFT; 63 a1 = ip[0] >> UNIT_QUANT_SHIFT;
65 c1 = ip[1] >> UNIT_QUANT_SHIFT; 64 c1 = ip[1] >> UNIT_QUANT_SHIFT;
66 d1 = ip[2] >> UNIT_QUANT_SHIFT; 65 d1 = ip[2] >> UNIT_QUANT_SHIFT;
67 b1 = ip[3] >> UNIT_QUANT_SHIFT; 66 b1 = ip[3] >> UNIT_QUANT_SHIFT;
68 a1 += c1; 67 a1 += c1;
69 d1 -= b1; 68 d1 -= b1;
70 e1 = (a1 - d1) >> 1; 69 e1 = (a1 - d1) >> 1;
71 b1 = e1 - b1; 70 b1 = e1 - b1;
72 c1 = e1 - c1; 71 c1 = e1 - c1;
73 a1 -= b1; 72 a1 -= b1;
74 d1 += c1; 73 d1 += c1;
75 op[0] = a1; 74 op[0] = WRAPLOW(a1, 8);
76 op[1] = b1; 75 op[1] = WRAPLOW(b1, 8);
77 op[2] = c1; 76 op[2] = WRAPLOW(c1, 8);
78 op[3] = d1; 77 op[3] = WRAPLOW(d1, 8);
79 ip += 4; 78 ip += 4;
80 op += 4; 79 op += 4;
81 } 80 }
82 81
83 ip = output; 82 ip = output;
84 for (i = 0; i < 4; i++) { 83 for (i = 0; i < 4; i++) {
85 a1 = ip[4 * 0]; 84 a1 = ip[4 * 0];
86 c1 = ip[4 * 1]; 85 c1 = ip[4 * 1];
87 d1 = ip[4 * 2]; 86 d1 = ip[4 * 2];
88 b1 = ip[4 * 3]; 87 b1 = ip[4 * 3];
89 a1 += c1; 88 a1 += c1;
90 d1 -= b1; 89 d1 -= b1;
91 e1 = (a1 - d1) >> 1; 90 e1 = (a1 - d1) >> 1;
92 b1 = e1 - b1; 91 b1 = e1 - b1;
93 c1 = e1 - c1; 92 c1 = e1 - c1;
94 a1 -= b1; 93 a1 -= b1;
95 d1 += c1; 94 d1 += c1;
96 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); 95 dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
97 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); 96 dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
98 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); 97 dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
99 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); 98 dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
100 99
101 ip++; 100 ip++;
102 dest++; 101 dest++;
103 } 102 }
104 } 103 }
105 104
106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { 105 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
107 int i; 106 int i;
108 tran_high_t a1, e1; 107 tran_high_t a1, e1;
109 tran_low_t tmp[4]; 108 tran_low_t tmp[4];
110 const tran_low_t *ip = in; 109 const tran_low_t *ip = in;
111 tran_low_t *op = tmp; 110 tran_low_t *op = tmp;
112 111
113 a1 = ip[0] >> UNIT_QUANT_SHIFT; 112 a1 = ip[0] >> UNIT_QUANT_SHIFT;
114 e1 = a1 >> 1; 113 e1 = a1 >> 1;
115 a1 -= e1; 114 a1 -= e1;
116 op[0] = a1; 115 op[0] = WRAPLOW(a1, 8);
117 op[1] = op[2] = op[3] = e1; 116 op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
118 117
119 ip = tmp; 118 ip = tmp;
120 for (i = 0; i < 4; i++) { 119 for (i = 0; i < 4; i++) {
121 e1 = ip[0] >> 1; 120 e1 = ip[0] >> 1;
122 a1 = ip[0] - e1; 121 a1 = ip[0] - e1;
123 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); 122 dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
124 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); 123 dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
125 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); 124 dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
126 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); 125 dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
127 ip++; 126 ip++;
128 dest++; 127 dest++;
129 } 128 }
130 } 129 }
131 130
132 static void idct4(const tran_low_t *input, tran_low_t *output) { 131 static void idct4(const tran_low_t *input, tran_low_t *output) {
133 tran_low_t step[4]; 132 tran_low_t step[4];
134 tran_high_t temp1, temp2; 133 tran_high_t temp1, temp2;
135 // stage 1 134 // stage 1
136 temp1 = (input[0] + input[2]) * cospi_16_64; 135 temp1 = (input[0] + input[2]) * cospi_16_64;
137 temp2 = (input[0] - input[2]) * cospi_16_64; 136 temp2 = (input[0] - input[2]) * cospi_16_64;
138 step[0] = dct_const_round_shift(temp1); 137 step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
139 step[1] = dct_const_round_shift(temp2); 138 step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
140 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 139 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
141 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 140 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
142 step[2] = dct_const_round_shift(temp1); 141 step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
143 step[3] = dct_const_round_shift(temp2); 142 step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
144 143
145 // stage 2 144 // stage 2
146 output[0] = step[0] + step[3]; 145 output[0] = WRAPLOW(step[0] + step[3], 8);
147 output[1] = step[1] + step[2]; 146 output[1] = WRAPLOW(step[1] + step[2], 8);
148 output[2] = step[1] - step[2]; 147 output[2] = WRAPLOW(step[1] - step[2], 8);
149 output[3] = step[0] - step[3]; 148 output[3] = WRAPLOW(step[0] - step[3], 8);
150 } 149 }
151 150
152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 151 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
153 tran_low_t out[4 * 4]; 152 tran_low_t out[4 * 4];
154 tran_low_t *outptr = out; 153 tran_low_t *outptr = out;
155 int i, j; 154 int i, j;
156 tran_low_t temp_in[4], temp_out[4]; 155 tran_low_t temp_in[4], temp_out[4];
157 156
158 // Rows 157 // Rows
159 for (i = 0; i < 4; ++i) { 158 for (i = 0; i < 4; ++i) {
160 idct4(input, outptr); 159 idct4(input, outptr);
161 input += 4; 160 input += 4;
162 outptr += 4; 161 outptr += 4;
163 } 162 }
164 163
165 // Columns 164 // Columns
166 for (i = 0; i < 4; ++i) { 165 for (i = 0; i < 4; ++i) {
167 for (j = 0; j < 4; ++j) 166 for (j = 0; j < 4; ++j)
168 temp_in[j] = out[j * 4 + i]; 167 temp_in[j] = out[j * 4 + i];
169 idct4(temp_in, temp_out); 168 idct4(temp_in, temp_out);
170 for (j = 0; j < 4; ++j) 169 for (j = 0; j < 4; ++j) {
171 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 170 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
172 + dest[j * stride + i]); 171 ROUND_POWER_OF_TWO(temp_out[j], 4));
172 }
173 } 173 }
174 } 174 }
175 175
176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, 176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
177 int dest_stride) { 177 int dest_stride) {
178 int i; 178 int i;
179 tran_high_t a1; 179 tran_high_t a1;
180 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 180 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
181 out = dct_const_round_shift(out * cospi_16_64); 181 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
182 a1 = ROUND_POWER_OF_TWO(out, 4); 182 a1 = ROUND_POWER_OF_TWO(out, 4);
183 183
184 for (i = 0; i < 4; i++) { 184 for (i = 0; i < 4; i++) {
185 dest[0] = clip_pixel(dest[0] + a1); 185 dest[0] = clip_pixel_add(dest[0], a1);
186 dest[1] = clip_pixel(dest[1] + a1); 186 dest[1] = clip_pixel_add(dest[1], a1);
187 dest[2] = clip_pixel(dest[2] + a1); 187 dest[2] = clip_pixel_add(dest[2], a1);
188 dest[3] = clip_pixel(dest[3] + a1); 188 dest[3] = clip_pixel_add(dest[3], a1);
189 dest += dest_stride; 189 dest += dest_stride;
190 } 190 }
191 } 191 }
192 192
193 static void idct8(const tran_low_t *input, tran_low_t *output) { 193 static void idct8(const tran_low_t *input, tran_low_t *output) {
194 tran_low_t step1[8], step2[8]; 194 tran_low_t step1[8], step2[8];
195 tran_high_t temp1, temp2; 195 tran_high_t temp1, temp2;
196 // stage 1 196 // stage 1
197 step1[0] = input[0]; 197 step1[0] = input[0];
198 step1[2] = input[4]; 198 step1[2] = input[4];
199 step1[1] = input[2]; 199 step1[1] = input[2];
200 step1[3] = input[6]; 200 step1[3] = input[6];
201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
203 step1[4] = dct_const_round_shift(temp1); 203 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
204 step1[7] = dct_const_round_shift(temp2); 204 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
206 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 206 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
207 step1[5] = dct_const_round_shift(temp1); 207 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
208 step1[6] = dct_const_round_shift(temp2); 208 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
209 209
210 // stage 2 & stage 3 - even half 210 // stage 2 & stage 3 - even half
211 idct4(step1, step1); 211 idct4(step1, step1);
212 212
213 // stage 2 - odd half 213 // stage 2 - odd half
214 step2[4] = step1[4] + step1[5]; 214 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
215 step2[5] = step1[4] - step1[5]; 215 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
216 step2[6] = -step1[6] + step1[7]; 216 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
217 step2[7] = step1[6] + step1[7]; 217 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
218 218
 219 // stage 3 - odd half 219 // stage 3 - odd half
220 step1[4] = step2[4]; 220 step1[4] = step2[4];
221 temp1 = (step2[6] - step2[5]) * cospi_16_64; 221 temp1 = (step2[6] - step2[5]) * cospi_16_64;
222 temp2 = (step2[5] + step2[6]) * cospi_16_64; 222 temp2 = (step2[5] + step2[6]) * cospi_16_64;
223 step1[5] = dct_const_round_shift(temp1); 223 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
224 step1[6] = dct_const_round_shift(temp2); 224 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
225 step1[7] = step2[7]; 225 step1[7] = step2[7];
226 226
227 // stage 4 227 // stage 4
228 output[0] = step1[0] + step1[7]; 228 output[0] = WRAPLOW(step1[0] + step1[7], 8);
229 output[1] = step1[1] + step1[6]; 229 output[1] = WRAPLOW(step1[1] + step1[6], 8);
230 output[2] = step1[2] + step1[5]; 230 output[2] = WRAPLOW(step1[2] + step1[5], 8);
231 output[3] = step1[3] + step1[4]; 231 output[3] = WRAPLOW(step1[3] + step1[4], 8);
232 output[4] = step1[3] - step1[4]; 232 output[4] = WRAPLOW(step1[3] - step1[4], 8);
233 output[5] = step1[2] - step1[5]; 233 output[5] = WRAPLOW(step1[2] - step1[5], 8);
234 output[6] = step1[1] - step1[6]; 234 output[6] = WRAPLOW(step1[1] - step1[6], 8);
235 output[7] = step1[0] - step1[7]; 235 output[7] = WRAPLOW(step1[0] - step1[7], 8);
236 } 236 }
237 237
238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
239 tran_low_t out[8 * 8]; 239 tran_low_t out[8 * 8];
240 tran_low_t *outptr = out; 240 tran_low_t *outptr = out;
241 int i, j; 241 int i, j;
242 tran_low_t temp_in[8], temp_out[8]; 242 tran_low_t temp_in[8], temp_out[8];
243 243
244 // First transform rows 244 // First transform rows
245 for (i = 0; i < 8; ++i) { 245 for (i = 0; i < 8; ++i) {
246 idct8(input, outptr); 246 idct8(input, outptr);
247 input += 8; 247 input += 8;
248 outptr += 8; 248 outptr += 8;
249 } 249 }
250 250
251 // Then transform columns 251 // Then transform columns
252 for (i = 0; i < 8; ++i) { 252 for (i = 0; i < 8; ++i) {
253 for (j = 0; j < 8; ++j) 253 for (j = 0; j < 8; ++j)
254 temp_in[j] = out[j * 8 + i]; 254 temp_in[j] = out[j * 8 + i];
255 idct8(temp_in, temp_out); 255 idct8(temp_in, temp_out);
256 for (j = 0; j < 8; ++j) 256 for (j = 0; j < 8; ++j) {
257 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 257 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
258 + dest[j * stride + i]); 258 ROUND_POWER_OF_TWO(temp_out[j], 5));
259 }
259 } 260 }
260 } 261 }
261 262
262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 263 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
263 int i, j; 264 int i, j;
264 tran_high_t a1; 265 tran_high_t a1;
265 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 266 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
266 out = dct_const_round_shift(out * cospi_16_64); 267 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
267 a1 = ROUND_POWER_OF_TWO(out, 5); 268 a1 = ROUND_POWER_OF_TWO(out, 5);
268 for (j = 0; j < 8; ++j) { 269 for (j = 0; j < 8; ++j) {
269 for (i = 0; i < 8; ++i) 270 for (i = 0; i < 8; ++i)
270 dest[i] = clip_pixel(dest[i] + a1); 271 dest[i] = clip_pixel_add(dest[i], a1);
271 dest += stride; 272 dest += stride;
272 } 273 }
273 } 274 }
274 275
275 static void iadst4(const tran_low_t *input, tran_low_t *output) { 276 static void iadst4(const tran_low_t *input, tran_low_t *output) {
276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 277 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
277 278
278 tran_high_t x0 = input[0]; 279 tran_high_t x0 = input[0];
279 tran_high_t x1 = input[1]; 280 tran_high_t x1 = input[1];
280 tran_high_t x2 = input[2]; 281 tran_high_t x2 = input[2];
(...skipping 20 matching lines...) Expand all
301 302
302 s0 = x0 + x3; 303 s0 = x0 + x3;
303 s1 = x1 + x3; 304 s1 = x1 + x3;
304 s2 = x2; 305 s2 = x2;
305 s3 = x0 + x1 - x3; 306 s3 = x0 + x1 - x3;
306 307
307 // 1-D transform scaling factor is sqrt(2). 308 // 1-D transform scaling factor is sqrt(2).
308 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 309 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
309 // + 1b (addition) = 29b. 310 // + 1b (addition) = 29b.
310 // Hence the output bit depth is 15b. 311 // Hence the output bit depth is 15b.
311 output[0] = dct_const_round_shift(s0); 312 output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
312 output[1] = dct_const_round_shift(s1); 313 output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
313 output[2] = dct_const_round_shift(s2); 314 output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
314 output[3] = dct_const_round_shift(s3); 315 output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
315 } 316 }
316 317
317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, 318 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
318 int tx_type) { 319 int tx_type) {
319 const transform_2d IHT_4[] = { 320 const transform_2d IHT_4[] = {
320 { idct4, idct4 }, // DCT_DCT = 0 321 { idct4, idct4 }, // DCT_DCT = 0
321 { iadst4, idct4 }, // ADST_DCT = 1 322 { iadst4, idct4 }, // ADST_DCT = 1
322 { idct4, iadst4 }, // DCT_ADST = 2 323 { idct4, iadst4 }, // DCT_ADST = 2
323 { iadst4, iadst4 } // ADST_ADST = 3 324 { iadst4, iadst4 } // ADST_ADST = 3
324 }; 325 };
325 326
326 int i, j; 327 int i, j;
327 tran_low_t out[4 * 4]; 328 tran_low_t out[4 * 4];
328 tran_low_t *outptr = out; 329 tran_low_t *outptr = out;
329 tran_low_t temp_in[4], temp_out[4]; 330 tran_low_t temp_in[4], temp_out[4];
330 331
331 // inverse transform row vectors 332 // inverse transform row vectors
332 for (i = 0; i < 4; ++i) { 333 for (i = 0; i < 4; ++i) {
333 IHT_4[tx_type].rows(input, outptr); 334 IHT_4[tx_type].rows(input, outptr);
334 input += 4; 335 input += 4;
335 outptr += 4; 336 outptr += 4;
336 } 337 }
337 338
338 // inverse transform column vectors 339 // inverse transform column vectors
339 for (i = 0; i < 4; ++i) { 340 for (i = 0; i < 4; ++i) {
340 for (j = 0; j < 4; ++j) 341 for (j = 0; j < 4; ++j)
341 temp_in[j] = out[j * 4 + i]; 342 temp_in[j] = out[j * 4 + i];
342 IHT_4[tx_type].cols(temp_in, temp_out); 343 IHT_4[tx_type].cols(temp_in, temp_out);
343 for (j = 0; j < 4; ++j) 344 for (j = 0; j < 4; ++j) {
344 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 345 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
345 + dest[j * stride + i]); 346 ROUND_POWER_OF_TWO(temp_out[j], 4));
347 }
346 } 348 }
347 } 349 }
350
348 static void iadst8(const tran_low_t *input, tran_low_t *output) { 351 static void iadst8(const tran_low_t *input, tran_low_t *output) {
349 int s0, s1, s2, s3, s4, s5, s6, s7; 352 int s0, s1, s2, s3, s4, s5, s6, s7;
350 353
351 tran_high_t x0 = input[7]; 354 tran_high_t x0 = input[7];
352 tran_high_t x1 = input[0]; 355 tran_high_t x1 = input[0];
353 tran_high_t x2 = input[5]; 356 tran_high_t x2 = input[5];
354 tran_high_t x3 = input[2]; 357 tran_high_t x3 = input[2];
355 tran_high_t x4 = input[3]; 358 tran_high_t x4 = input[3];
356 tran_high_t x5 = input[4]; 359 tran_high_t x5 = input[4];
357 tran_high_t x6 = input[1]; 360 tran_high_t x6 = input[1];
358 tran_high_t x7 = input[6]; 361 tran_high_t x7 = input[6];
359 362
360 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 363 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
361 output[0] = output[1] = output[2] = output[3] = output[4] 364 output[0] = output[1] = output[2] = output[3] = output[4]
362 = output[5] = output[6] = output[7] = 0; 365 = output[5] = output[6] = output[7] = 0;
363 return; 366 return;
364 } 367 }
365 368
366 // stage 1 369 // stage 1
367 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 370 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
368 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 371 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
369 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 372 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
370 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 373 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
371 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 374 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
372 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 375 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
373 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 376 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
374 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 377 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
375 378
376 x0 = dct_const_round_shift(s0 + s4); 379 x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
377 x1 = dct_const_round_shift(s1 + s5); 380 x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
378 x2 = dct_const_round_shift(s2 + s6); 381 x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
379 x3 = dct_const_round_shift(s3 + s7); 382 x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
380 x4 = dct_const_round_shift(s0 - s4); 383 x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
381 x5 = dct_const_round_shift(s1 - s5); 384 x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
382 x6 = dct_const_round_shift(s2 - s6); 385 x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
383 x7 = dct_const_round_shift(s3 - s7); 386 x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
384 387
385 // stage 2 388 // stage 2
386 s0 = x0; 389 s0 = x0;
387 s1 = x1; 390 s1 = x1;
388 s2 = x2; 391 s2 = x2;
389 s3 = x3; 392 s3 = x3;
390 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 393 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
391 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 394 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
392 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 395 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
393 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 396 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
394 397
395 x0 = s0 + s2; 398 x0 = WRAPLOW(s0 + s2, 8);
396 x1 = s1 + s3; 399 x1 = WRAPLOW(s1 + s3, 8);
397 x2 = s0 - s2; 400 x2 = WRAPLOW(s0 - s2, 8);
398 x3 = s1 - s3; 401 x3 = WRAPLOW(s1 - s3, 8);
399 x4 = dct_const_round_shift(s4 + s6); 402 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
400 x5 = dct_const_round_shift(s5 + s7); 403 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
401 x6 = dct_const_round_shift(s4 - s6); 404 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
402 x7 = dct_const_round_shift(s5 - s7); 405 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
403 406
404 // stage 3 407 // stage 3
405 s2 = cospi_16_64 * (x2 + x3); 408 s2 = cospi_16_64 * (x2 + x3);
406 s3 = cospi_16_64 * (x2 - x3); 409 s3 = cospi_16_64 * (x2 - x3);
407 s6 = cospi_16_64 * (x6 + x7); 410 s6 = cospi_16_64 * (x6 + x7);
408 s7 = cospi_16_64 * (x6 - x7); 411 s7 = cospi_16_64 * (x6 - x7);
409 412
410 x2 = dct_const_round_shift(s2); 413 x2 = WRAPLOW(dct_const_round_shift(s2), 8);
411 x3 = dct_const_round_shift(s3); 414 x3 = WRAPLOW(dct_const_round_shift(s3), 8);
412 x6 = dct_const_round_shift(s6); 415 x6 = WRAPLOW(dct_const_round_shift(s6), 8);
413 x7 = dct_const_round_shift(s7); 416 x7 = WRAPLOW(dct_const_round_shift(s7), 8);
414 417
415 output[0] = x0; 418 output[0] = WRAPLOW(x0, 8);
416 output[1] = -x4; 419 output[1] = WRAPLOW(-x4, 8);
417 output[2] = x6; 420 output[2] = WRAPLOW(x6, 8);
418 output[3] = -x2; 421 output[3] = WRAPLOW(-x2, 8);
419 output[4] = x3; 422 output[4] = WRAPLOW(x3, 8);
420 output[5] = -x7; 423 output[5] = WRAPLOW(-x7, 8);
421 output[6] = x5; 424 output[6] = WRAPLOW(x5, 8);
422 output[7] = -x1; 425 output[7] = WRAPLOW(-x1, 8);
423 } 426 }
424 427
425 static const transform_2d IHT_8[] = { 428 static const transform_2d IHT_8[] = {
426 { idct8, idct8 }, // DCT_DCT = 0 429 { idct8, idct8 }, // DCT_DCT = 0
427 { iadst8, idct8 }, // ADST_DCT = 1 430 { iadst8, idct8 }, // ADST_DCT = 1
428 { idct8, iadst8 }, // DCT_ADST = 2 431 { idct8, iadst8 }, // DCT_ADST = 2
429 { iadst8, iadst8 } // ADST_ADST = 3 432 { iadst8, iadst8 } // ADST_ADST = 3
430 }; 433 };
431 434
432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, 435 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
433 int tx_type) { 436 int tx_type) {
434 int i, j; 437 int i, j;
435 tran_low_t out[8 * 8]; 438 tran_low_t out[8 * 8];
436 tran_low_t *outptr = out; 439 tran_low_t *outptr = out;
437 tran_low_t temp_in[8], temp_out[8]; 440 tran_low_t temp_in[8], temp_out[8];
438 const transform_2d ht = IHT_8[tx_type]; 441 const transform_2d ht = IHT_8[tx_type];
439 442
440 // inverse transform row vectors 443 // inverse transform row vectors
441 for (i = 0; i < 8; ++i) { 444 for (i = 0; i < 8; ++i) {
442 ht.rows(input, outptr); 445 ht.rows(input, outptr);
443 input += 8; 446 input += 8;
444 outptr += 8; 447 outptr += 8;
445 } 448 }
446 449
447 // inverse transform column vectors 450 // inverse transform column vectors
448 for (i = 0; i < 8; ++i) { 451 for (i = 0; i < 8; ++i) {
449 for (j = 0; j < 8; ++j) 452 for (j = 0; j < 8; ++j)
450 temp_in[j] = out[j * 8 + i]; 453 temp_in[j] = out[j * 8 + i];
451 ht.cols(temp_in, temp_out); 454 ht.cols(temp_in, temp_out);
452 for (j = 0; j < 8; ++j) 455 for (j = 0; j < 8; ++j) {
453 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 456 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
454 + dest[j * stride + i]); 457 ROUND_POWER_OF_TWO(temp_out[j], 5));
458 }
455 } 459 }
456 } 460 }
457 461
458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 462 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
459 tran_low_t out[8 * 8] = { 0 }; 463 tran_low_t out[8 * 8] = { 0 };
460 tran_low_t *outptr = out; 464 tran_low_t *outptr = out;
461 int i, j; 465 int i, j;
462 tran_low_t temp_in[8], temp_out[8]; 466 tran_low_t temp_in[8], temp_out[8];
463 467
464 // First transform rows 468 // First transform rows
 465 // only the first 4 rows have non-zero coefs 469 // only the first 4 rows have non-zero coefs
466 for (i = 0; i < 4; ++i) { 470 for (i = 0; i < 4; ++i) {
467 idct8(input, outptr); 471 idct8(input, outptr);
468 input += 8; 472 input += 8;
469 outptr += 8; 473 outptr += 8;
470 } 474 }
471 475
472 // Then transform columns 476 // Then transform columns
473 for (i = 0; i < 8; ++i) { 477 for (i = 0; i < 8; ++i) {
474 for (j = 0; j < 8; ++j) 478 for (j = 0; j < 8; ++j)
475 temp_in[j] = out[j * 8 + i]; 479 temp_in[j] = out[j * 8 + i];
476 idct8(temp_in, temp_out); 480 idct8(temp_in, temp_out);
477 for (j = 0; j < 8; ++j) 481 for (j = 0; j < 8; ++j) {
478 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 482 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
479 + dest[j * stride + i]); 483 ROUND_POWER_OF_TWO(temp_out[j], 5));
484 }
480 } 485 }
481 } 486 }
482 487
483 static void idct16(const tran_low_t *input, tran_low_t *output) { 488 static void idct16(const tran_low_t *input, tran_low_t *output) {
484 tran_low_t step1[16], step2[16]; 489 tran_low_t step1[16], step2[16];
485 tran_high_t temp1, temp2; 490 tran_high_t temp1, temp2;
486 491
487 // stage 1 492 // stage 1
488 step1[0] = input[0/2]; 493 step1[0] = input[0/2];
489 step1[1] = input[16/2]; 494 step1[1] = input[16/2];
(...skipping 17 matching lines...) Expand all
507 step2[1] = step1[1]; 512 step2[1] = step1[1];
508 step2[2] = step1[2]; 513 step2[2] = step1[2];
509 step2[3] = step1[3]; 514 step2[3] = step1[3];
510 step2[4] = step1[4]; 515 step2[4] = step1[4];
511 step2[5] = step1[5]; 516 step2[5] = step1[5];
512 step2[6] = step1[6]; 517 step2[6] = step1[6];
513 step2[7] = step1[7]; 518 step2[7] = step1[7];
514 519
515 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 520 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
516 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 521 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
517 step2[8] = dct_const_round_shift(temp1); 522 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
518 step2[15] = dct_const_round_shift(temp2); 523 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
519 524
520 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 525 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
521 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 526 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
522 step2[9] = dct_const_round_shift(temp1); 527 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
523 step2[14] = dct_const_round_shift(temp2); 528 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
524 529
525 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 530 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
526 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 531 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
527 step2[10] = dct_const_round_shift(temp1); 532 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
528 step2[13] = dct_const_round_shift(temp2); 533 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
529 534
530 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 535 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
531 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 536 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
532 step2[11] = dct_const_round_shift(temp1); 537 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
533 step2[12] = dct_const_round_shift(temp2); 538 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
534 539
535 // stage 3 540 // stage 3
536 step1[0] = step2[0]; 541 step1[0] = step2[0];
537 step1[1] = step2[1]; 542 step1[1] = step2[1];
538 step1[2] = step2[2]; 543 step1[2] = step2[2];
539 step1[3] = step2[3]; 544 step1[3] = step2[3];
540 545
541 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 546 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
542 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 547 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
543 step1[4] = dct_const_round_shift(temp1); 548 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
544 step1[7] = dct_const_round_shift(temp2); 549 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
545 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 550 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
546 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 551 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
547 step1[5] = dct_const_round_shift(temp1); 552 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
548 step1[6] = dct_const_round_shift(temp2); 553 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
549 554
550 step1[8] = step2[8] + step2[9]; 555 step1[8] = WRAPLOW(step2[8] + step2[9], 8);
551 step1[9] = step2[8] - step2[9]; 556 step1[9] = WRAPLOW(step2[8] - step2[9], 8);
552 step1[10] = -step2[10] + step2[11]; 557 step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
553 step1[11] = step2[10] + step2[11]; 558 step1[11] = WRAPLOW(step2[10] + step2[11], 8);
554 step1[12] = step2[12] + step2[13]; 559 step1[12] = WRAPLOW(step2[12] + step2[13], 8);
555 step1[13] = step2[12] - step2[13]; 560 step1[13] = WRAPLOW(step2[12] - step2[13], 8);
556 step1[14] = -step2[14] + step2[15]; 561 step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
557 step1[15] = step2[14] + step2[15]; 562 step1[15] = WRAPLOW(step2[14] + step2[15], 8);
558 563
559 // stage 4 564 // stage 4
560 temp1 = (step1[0] + step1[1]) * cospi_16_64; 565 temp1 = (step1[0] + step1[1]) * cospi_16_64;
561 temp2 = (step1[0] - step1[1]) * cospi_16_64; 566 temp2 = (step1[0] - step1[1]) * cospi_16_64;
562 step2[0] = dct_const_round_shift(temp1); 567 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
563 step2[1] = dct_const_round_shift(temp2); 568 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
564 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 569 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
565 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 570 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
566 step2[2] = dct_const_round_shift(temp1); 571 step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
567 step2[3] = dct_const_round_shift(temp2); 572 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
568 step2[4] = step1[4] + step1[5]; 573 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
569 step2[5] = step1[4] - step1[5]; 574 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
570 step2[6] = -step1[6] + step1[7]; 575 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
571 step2[7] = step1[6] + step1[7]; 576 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
572 577
573 step2[8] = step1[8]; 578 step2[8] = step1[8];
574 step2[15] = step1[15]; 579 step2[15] = step1[15];
575 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 580 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
576 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 581 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
577 step2[9] = dct_const_round_shift(temp1); 582 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
578 step2[14] = dct_const_round_shift(temp2); 583 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
579 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 584 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
580 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 585 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
581 step2[10] = dct_const_round_shift(temp1); 586 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
582 step2[13] = dct_const_round_shift(temp2); 587 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
583 step2[11] = step1[11]; 588 step2[11] = step1[11];
584 step2[12] = step1[12]; 589 step2[12] = step1[12];
585 590
586 // stage 5 591 // stage 5
587 step1[0] = step2[0] + step2[3]; 592 step1[0] = WRAPLOW(step2[0] + step2[3], 8);
588 step1[1] = step2[1] + step2[2]; 593 step1[1] = WRAPLOW(step2[1] + step2[2], 8);
589 step1[2] = step2[1] - step2[2]; 594 step1[2] = WRAPLOW(step2[1] - step2[2], 8);
590 step1[3] = step2[0] - step2[3]; 595 step1[3] = WRAPLOW(step2[0] - step2[3], 8);
591 step1[4] = step2[4]; 596 step1[4] = step2[4];
592 temp1 = (step2[6] - step2[5]) * cospi_16_64; 597 temp1 = (step2[6] - step2[5]) * cospi_16_64;
593 temp2 = (step2[5] + step2[6]) * cospi_16_64; 598 temp2 = (step2[5] + step2[6]) * cospi_16_64;
594 step1[5] = dct_const_round_shift(temp1); 599 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
595 step1[6] = dct_const_round_shift(temp2); 600 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
596 step1[7] = step2[7]; 601 step1[7] = step2[7];
597 602
598 step1[8] = step2[8] + step2[11]; 603 step1[8] = WRAPLOW(step2[8] + step2[11], 8);
599 step1[9] = step2[9] + step2[10]; 604 step1[9] = WRAPLOW(step2[9] + step2[10], 8);
600 step1[10] = step2[9] - step2[10]; 605 step1[10] = WRAPLOW(step2[9] - step2[10], 8);
601 step1[11] = step2[8] - step2[11]; 606 step1[11] = WRAPLOW(step2[8] - step2[11], 8);
602 step1[12] = -step2[12] + step2[15]; 607 step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
603 step1[13] = -step2[13] + step2[14]; 608 step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
604 step1[14] = step2[13] + step2[14]; 609 step1[14] = WRAPLOW(step2[13] + step2[14], 8);
605 step1[15] = step2[12] + step2[15]; 610 step1[15] = WRAPLOW(step2[12] + step2[15], 8);
606 611
607 // stage 6 612 // stage 6
608 step2[0] = step1[0] + step1[7]; 613 step2[0] = WRAPLOW(step1[0] + step1[7], 8);
609 step2[1] = step1[1] + step1[6]; 614 step2[1] = WRAPLOW(step1[1] + step1[6], 8);
610 step2[2] = step1[2] + step1[5]; 615 step2[2] = WRAPLOW(step1[2] + step1[5], 8);
611 step2[3] = step1[3] + step1[4]; 616 step2[3] = WRAPLOW(step1[3] + step1[4], 8);
612 step2[4] = step1[3] - step1[4]; 617 step2[4] = WRAPLOW(step1[3] - step1[4], 8);
613 step2[5] = step1[2] - step1[5]; 618 step2[5] = WRAPLOW(step1[2] - step1[5], 8);
614 step2[6] = step1[1] - step1[6]; 619 step2[6] = WRAPLOW(step1[1] - step1[6], 8);
615 step2[7] = step1[0] - step1[7]; 620 step2[7] = WRAPLOW(step1[0] - step1[7], 8);
616 step2[8] = step1[8]; 621 step2[8] = step1[8];
617 step2[9] = step1[9]; 622 step2[9] = step1[9];
618 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 623 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
619 temp2 = (step1[10] + step1[13]) * cospi_16_64; 624 temp2 = (step1[10] + step1[13]) * cospi_16_64;
620 step2[10] = dct_const_round_shift(temp1); 625 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
621 step2[13] = dct_const_round_shift(temp2); 626 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
622 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 627 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
623 temp2 = (step1[11] + step1[12]) * cospi_16_64; 628 temp2 = (step1[11] + step1[12]) * cospi_16_64;
624 step2[11] = dct_const_round_shift(temp1); 629 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
625 step2[12] = dct_const_round_shift(temp2); 630 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
626 step2[14] = step1[14]; 631 step2[14] = step1[14];
627 step2[15] = step1[15]; 632 step2[15] = step1[15];
628 633
629 // stage 7 634 // stage 7
630 output[0] = step2[0] + step2[15]; 635 output[0] = WRAPLOW(step2[0] + step2[15], 8);
631 output[1] = step2[1] + step2[14]; 636 output[1] = WRAPLOW(step2[1] + step2[14], 8);
632 output[2] = step2[2] + step2[13]; 637 output[2] = WRAPLOW(step2[2] + step2[13], 8);
633 output[3] = step2[3] + step2[12]; 638 output[3] = WRAPLOW(step2[3] + step2[12], 8);
634 output[4] = step2[4] + step2[11]; 639 output[4] = WRAPLOW(step2[4] + step2[11], 8);
635 output[5] = step2[5] + step2[10]; 640 output[5] = WRAPLOW(step2[5] + step2[10], 8);
636 output[6] = step2[6] + step2[9]; 641 output[6] = WRAPLOW(step2[6] + step2[9], 8);
637 output[7] = step2[7] + step2[8]; 642 output[7] = WRAPLOW(step2[7] + step2[8], 8);
638 output[8] = step2[7] - step2[8]; 643 output[8] = WRAPLOW(step2[7] - step2[8], 8);
639 output[9] = step2[6] - step2[9]; 644 output[9] = WRAPLOW(step2[6] - step2[9], 8);
640 output[10] = step2[5] - step2[10]; 645 output[10] = WRAPLOW(step2[5] - step2[10], 8);
641 output[11] = step2[4] - step2[11]; 646 output[11] = WRAPLOW(step2[4] - step2[11], 8);
642 output[12] = step2[3] - step2[12]; 647 output[12] = WRAPLOW(step2[3] - step2[12], 8);
643 output[13] = step2[2] - step2[13]; 648 output[13] = WRAPLOW(step2[2] - step2[13], 8);
644 output[14] = step2[1] - step2[14]; 649 output[14] = WRAPLOW(step2[1] - step2[14], 8);
645 output[15] = step2[0] - step2[15]; 650 output[15] = WRAPLOW(step2[0] - step2[15], 8);
646 } 651 }
647 652
648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, 653 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
649 int stride) { 654 int stride) {
650 tran_low_t out[16 * 16]; 655 tran_low_t out[16 * 16];
651 tran_low_t *outptr = out; 656 tran_low_t *outptr = out;
652 int i, j; 657 int i, j;
653 tran_low_t temp_in[16], temp_out[16]; 658 tran_low_t temp_in[16], temp_out[16];
654 659
655 // First transform rows 660 // First transform rows
656 for (i = 0; i < 16; ++i) { 661 for (i = 0; i < 16; ++i) {
657 idct16(input, outptr); 662 idct16(input, outptr);
658 input += 16; 663 input += 16;
659 outptr += 16; 664 outptr += 16;
660 } 665 }
661 666
662 // Then transform columns 667 // Then transform columns
663 for (i = 0; i < 16; ++i) { 668 for (i = 0; i < 16; ++i) {
664 for (j = 0; j < 16; ++j) 669 for (j = 0; j < 16; ++j)
665 temp_in[j] = out[j * 16 + i]; 670 temp_in[j] = out[j * 16 + i];
666 idct16(temp_in, temp_out); 671 idct16(temp_in, temp_out);
667 for (j = 0; j < 16; ++j) 672 for (j = 0; j < 16; ++j) {
668 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 673 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
669 + dest[j * stride + i]); 674 ROUND_POWER_OF_TWO(temp_out[j], 6));
675 }
670 } 676 }
671 } 677 }
672 678
673 static void iadst16(const tran_low_t *input, tran_low_t *output) { 679 static void iadst16(const tran_low_t *input, tran_low_t *output) {
674 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 680 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
675 tran_high_t s9, s10, s11, s12, s13, s14, s15; 681 tran_high_t s9, s10, s11, s12, s13, s14, s15;
676 682
677 tran_high_t x0 = input[15]; 683 tran_high_t x0 = input[15];
678 tran_high_t x1 = input[0]; 684 tran_high_t x1 = input[0];
679 tran_high_t x2 = input[13]; 685 tran_high_t x2 = input[13];
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
711 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 717 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
712 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 718 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
713 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 719 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
714 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 720 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
715 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 721 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
716 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 722 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
717 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 723 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
718 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 724 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
719 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 725 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
720 726
721 x0 = dct_const_round_shift(s0 + s8); 727 x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
722 x1 = dct_const_round_shift(s1 + s9); 728 x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
723 x2 = dct_const_round_shift(s2 + s10); 729 x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
724 x3 = dct_const_round_shift(s3 + s11); 730 x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
725 x4 = dct_const_round_shift(s4 + s12); 731 x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
726 x5 = dct_const_round_shift(s5 + s13); 732 x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
727 x6 = dct_const_round_shift(s6 + s14); 733 x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
728 x7 = dct_const_round_shift(s7 + s15); 734 x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
729 x8 = dct_const_round_shift(s0 - s8); 735 x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
730 x9 = dct_const_round_shift(s1 - s9); 736 x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
731 x10 = dct_const_round_shift(s2 - s10); 737 x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
732 x11 = dct_const_round_shift(s3 - s11); 738 x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
733 x12 = dct_const_round_shift(s4 - s12); 739 x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
734 x13 = dct_const_round_shift(s5 - s13); 740 x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
735 x14 = dct_const_round_shift(s6 - s14); 741 x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
736 x15 = dct_const_round_shift(s7 - s15); 742 x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
737 743
738 // stage 2 744 // stage 2
739 s0 = x0; 745 s0 = x0;
740 s1 = x1; 746 s1 = x1;
741 s2 = x2; 747 s2 = x2;
742 s3 = x3; 748 s3 = x3;
743 s4 = x4; 749 s4 = x4;
744 s5 = x5; 750 s5 = x5;
745 s6 = x6; 751 s6 = x6;
746 s7 = x7; 752 s7 = x7;
747 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 753 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
748 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 754 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
749 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 755 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
750 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 756 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
751 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 757 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
752 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 758 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
753 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 759 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
754 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 760 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
755 761
756 x0 = s0 + s4; 762 x0 = WRAPLOW(s0 + s4, 8);
757 x1 = s1 + s5; 763 x1 = WRAPLOW(s1 + s5, 8);
758 x2 = s2 + s6; 764 x2 = WRAPLOW(s2 + s6, 8);
759 x3 = s3 + s7; 765 x3 = WRAPLOW(s3 + s7, 8);
760 x4 = s0 - s4; 766 x4 = WRAPLOW(s0 - s4, 8);
761 x5 = s1 - s5; 767 x5 = WRAPLOW(s1 - s5, 8);
762 x6 = s2 - s6; 768 x6 = WRAPLOW(s2 - s6, 8);
763 x7 = s3 - s7; 769 x7 = WRAPLOW(s3 - s7, 8);
764 x8 = dct_const_round_shift(s8 + s12); 770 x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
765 x9 = dct_const_round_shift(s9 + s13); 771 x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
766 x10 = dct_const_round_shift(s10 + s14); 772 x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
767 x11 = dct_const_round_shift(s11 + s15); 773 x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
768 x12 = dct_const_round_shift(s8 - s12); 774 x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
769 x13 = dct_const_round_shift(s9 - s13); 775 x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
770 x14 = dct_const_round_shift(s10 - s14); 776 x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
771 x15 = dct_const_round_shift(s11 - s15); 777 x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
772 778
773 // stage 3 779 // stage 3
774 s0 = x0; 780 s0 = x0;
775 s1 = x1; 781 s1 = x1;
776 s2 = x2; 782 s2 = x2;
777 s3 = x3; 783 s3 = x3;
778 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 784 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
779 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 785 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
780 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 786 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
781 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 787 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
782 s8 = x8; 788 s8 = x8;
783 s9 = x9; 789 s9 = x9;
784 s10 = x10; 790 s10 = x10;
785 s11 = x11; 791 s11 = x11;
786 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 792 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
787 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 793 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
788 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 794 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
789 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 795 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
790 796
791 x0 = s0 + s2; 797 x0 = WRAPLOW(check_range(s0 + s2), 8);
792 x1 = s1 + s3; 798 x1 = WRAPLOW(check_range(s1 + s3), 8);
793 x2 = s0 - s2; 799 x2 = WRAPLOW(check_range(s0 - s2), 8);
794 x3 = s1 - s3; 800 x3 = WRAPLOW(check_range(s1 - s3), 8);
795 x4 = dct_const_round_shift(s4 + s6); 801 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
796 x5 = dct_const_round_shift(s5 + s7); 802 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
797 x6 = dct_const_round_shift(s4 - s6); 803 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
798 x7 = dct_const_round_shift(s5 - s7); 804 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
799 x8 = s8 + s10; 805 x8 = WRAPLOW(check_range(s8 + s10), 8);
800 x9 = s9 + s11; 806 x9 = WRAPLOW(check_range(s9 + s11), 8);
801 x10 = s8 - s10; 807 x10 = WRAPLOW(check_range(s8 - s10), 8);
802 x11 = s9 - s11; 808 x11 = WRAPLOW(check_range(s9 - s11), 8);
803 x12 = dct_const_round_shift(s12 + s14); 809 x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
804 x13 = dct_const_round_shift(s13 + s15); 810 x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
805 x14 = dct_const_round_shift(s12 - s14); 811 x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
806 x15 = dct_const_round_shift(s13 - s15); 812 x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
807 813
808 // stage 4 814 // stage 4
809 s2 = (- cospi_16_64) * (x2 + x3); 815 s2 = (- cospi_16_64) * (x2 + x3);
810 s3 = cospi_16_64 * (x2 - x3); 816 s3 = cospi_16_64 * (x2 - x3);
811 s6 = cospi_16_64 * (x6 + x7); 817 s6 = cospi_16_64 * (x6 + x7);
812 s7 = cospi_16_64 * (- x6 + x7); 818 s7 = cospi_16_64 * (- x6 + x7);
813 s10 = cospi_16_64 * (x10 + x11); 819 s10 = cospi_16_64 * (x10 + x11);
814 s11 = cospi_16_64 * (- x10 + x11); 820 s11 = cospi_16_64 * (- x10 + x11);
815 s14 = (- cospi_16_64) * (x14 + x15); 821 s14 = (- cospi_16_64) * (x14 + x15);
816 s15 = cospi_16_64 * (x14 - x15); 822 s15 = cospi_16_64 * (x14 - x15);
817 823
818 x2 = dct_const_round_shift(s2); 824 x2 = WRAPLOW(dct_const_round_shift(s2), 8);
819 x3 = dct_const_round_shift(s3); 825 x3 = WRAPLOW(dct_const_round_shift(s3), 8);
820 x6 = dct_const_round_shift(s6); 826 x6 = WRAPLOW(dct_const_round_shift(s6), 8);
821 x7 = dct_const_round_shift(s7); 827 x7 = WRAPLOW(dct_const_round_shift(s7), 8);
822 x10 = dct_const_round_shift(s10); 828 x10 = WRAPLOW(dct_const_round_shift(s10), 8);
823 x11 = dct_const_round_shift(s11); 829 x11 = WRAPLOW(dct_const_round_shift(s11), 8);
824 x14 = dct_const_round_shift(s14); 830 x14 = WRAPLOW(dct_const_round_shift(s14), 8);
825 x15 = dct_const_round_shift(s15); 831 x15 = WRAPLOW(dct_const_round_shift(s15), 8);
826 832
827 output[0] = x0; 833 output[0] = WRAPLOW(x0, 8);
828 output[1] = -x8; 834 output[1] = WRAPLOW(-x8, 8);
829 output[2] = x12; 835 output[2] = WRAPLOW(x12, 8);
830 output[3] = -x4; 836 output[3] = WRAPLOW(-x4, 8);
831 output[4] = x6; 837 output[4] = WRAPLOW(x6, 8);
832 output[5] = x14; 838 output[5] = WRAPLOW(x14, 8);
833 output[6] = x10; 839 output[6] = WRAPLOW(x10, 8);
834 output[7] = x2; 840 output[7] = WRAPLOW(x2, 8);
835 output[8] = x3; 841 output[8] = WRAPLOW(x3, 8);
836 output[9] = x11; 842 output[9] = WRAPLOW(x11, 8);
837 output[10] = x15; 843 output[10] = WRAPLOW(x15, 8);
838 output[11] = x7; 844 output[11] = WRAPLOW(x7, 8);
839 output[12] = x5; 845 output[12] = WRAPLOW(x5, 8);
840 output[13] = -x13; 846 output[13] = WRAPLOW(-x13, 8);
841 output[14] = x9; 847 output[14] = WRAPLOW(x9, 8);
842 output[15] = -x1; 848 output[15] = WRAPLOW(-x1, 8);
843 } 849 }
844 850
845 static const transform_2d IHT_16[] = { 851 static const transform_2d IHT_16[] = {
846 { idct16, idct16 }, // DCT_DCT = 0 852 { idct16, idct16 }, // DCT_DCT = 0
847 { iadst16, idct16 }, // ADST_DCT = 1 853 { iadst16, idct16 }, // ADST_DCT = 1
848 { idct16, iadst16 }, // DCT_ADST = 2 854 { idct16, iadst16 }, // DCT_ADST = 2
849 { iadst16, iadst16 } // ADST_ADST = 3 855 { iadst16, iadst16 } // ADST_ADST = 3
850 }; 856 };
851 857
852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, 858 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
853 int tx_type) { 859 int tx_type) {
854 int i, j; 860 int i, j;
855 tran_low_t out[16 * 16]; 861 tran_low_t out[16 * 16];
856 tran_low_t *outptr = out; 862 tran_low_t *outptr = out;
857 tran_low_t temp_in[16], temp_out[16]; 863 tran_low_t temp_in[16], temp_out[16];
858 const transform_2d ht = IHT_16[tx_type]; 864 const transform_2d ht = IHT_16[tx_type];
859 865
860 // Rows 866 // Rows
861 for (i = 0; i < 16; ++i) { 867 for (i = 0; i < 16; ++i) {
862 ht.rows(input, outptr); 868 ht.rows(input, outptr);
863 input += 16; 869 input += 16;
864 outptr += 16; 870 outptr += 16;
865 } 871 }
866 872
867 // Columns 873 // Columns
868 for (i = 0; i < 16; ++i) { 874 for (i = 0; i < 16; ++i) {
869 for (j = 0; j < 16; ++j) 875 for (j = 0; j < 16; ++j)
870 temp_in[j] = out[j * 16 + i]; 876 temp_in[j] = out[j * 16 + i];
871 ht.cols(temp_in, temp_out); 877 ht.cols(temp_in, temp_out);
872 for (j = 0; j < 16; ++j) 878 for (j = 0; j < 16; ++j) {
873 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 879 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
874 + dest[j * stride + i]); 880 ROUND_POWER_OF_TWO(temp_out[j], 6));
881 }
875 } 882 }
876 } 883 }
877 884
878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, 885 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
879 int stride) { 886 int stride) {
880 tran_low_t out[16 * 16] = { 0 }; 887 tran_low_t out[16 * 16] = { 0 };
881 tran_low_t *outptr = out; 888 tran_low_t *outptr = out;
882 int i, j; 889 int i, j;
883 tran_low_t temp_in[16], temp_out[16]; 890 tran_low_t temp_in[16], temp_out[16];
884 891
885 // First transform rows. Since all non-zero dct coefficients are in 892 // First transform rows. Since all non-zero dct coefficients are in
886 // upper-left 4x4 area, we only need to calculate first 4 rows here. 893 // upper-left 4x4 area, we only need to calculate first 4 rows here.
887 for (i = 0; i < 4; ++i) { 894 for (i = 0; i < 4; ++i) {
888 idct16(input, outptr); 895 idct16(input, outptr);
889 input += 16; 896 input += 16;
890 outptr += 16; 897 outptr += 16;
891 } 898 }
892 899
893 // Then transform columns 900 // Then transform columns
894 for (i = 0; i < 16; ++i) { 901 for (i = 0; i < 16; ++i) {
895 for (j = 0; j < 16; ++j) 902 for (j = 0; j < 16; ++j)
896 temp_in[j] = out[j*16 + i]; 903 temp_in[j] = out[j*16 + i];
897 idct16(temp_in, temp_out); 904 idct16(temp_in, temp_out);
898 for (j = 0; j < 16; ++j) 905 for (j = 0; j < 16; ++j) {
899 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 906 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
900 + dest[j * stride + i]); 907 ROUND_POWER_OF_TWO(temp_out[j], 6));
908 }
901 } 909 }
902 } 910 }
903 911
904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 912 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
905 int i, j; 913 int i, j;
906 tran_high_t a1; 914 tran_high_t a1;
907 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 915 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
908 out = dct_const_round_shift(out * cospi_16_64); 916 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
909 a1 = ROUND_POWER_OF_TWO(out, 6); 917 a1 = ROUND_POWER_OF_TWO(out, 6);
910 for (j = 0; j < 16; ++j) { 918 for (j = 0; j < 16; ++j) {
911 for (i = 0; i < 16; ++i) 919 for (i = 0; i < 16; ++i)
912 dest[i] = clip_pixel(dest[i] + a1); 920 dest[i] = clip_pixel_add(dest[i], a1);
913 dest += stride; 921 dest += stride;
914 } 922 }
915 } 923 }
916 924
917 static void idct32(const tran_low_t *input, tran_low_t *output) { 925 static void idct32(const tran_low_t *input, tran_low_t *output) {
918 tran_low_t step1[32], step2[32]; 926 tran_low_t step1[32], step2[32];
919 tran_high_t temp1, temp2; 927 tran_high_t temp1, temp2;
920 928
921 // stage 1 929 // stage 1
922 step1[0] = input[0]; 930 step1[0] = input[0];
923 step1[1] = input[16]; 931 step1[1] = input[16];
924 step1[2] = input[8]; 932 step1[2] = input[8];
925 step1[3] = input[24]; 933 step1[3] = input[24];
926 step1[4] = input[4]; 934 step1[4] = input[4];
927 step1[5] = input[20]; 935 step1[5] = input[20];
928 step1[6] = input[12]; 936 step1[6] = input[12];
929 step1[7] = input[28]; 937 step1[7] = input[28];
930 step1[8] = input[2]; 938 step1[8] = input[2];
931 step1[9] = input[18]; 939 step1[9] = input[18];
932 step1[10] = input[10]; 940 step1[10] = input[10];
933 step1[11] = input[26]; 941 step1[11] = input[26];
934 step1[12] = input[6]; 942 step1[12] = input[6];
935 step1[13] = input[22]; 943 step1[13] = input[22];
936 step1[14] = input[14]; 944 step1[14] = input[14];
937 step1[15] = input[30]; 945 step1[15] = input[30];
938 946
939 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 947 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
940 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 948 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
941 step1[16] = dct_const_round_shift(temp1); 949 step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
942 step1[31] = dct_const_round_shift(temp2); 950 step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
943 951
944 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 952 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
945 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 953 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
946 step1[17] = dct_const_round_shift(temp1); 954 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
947 step1[30] = dct_const_round_shift(temp2); 955 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
948 956
949 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 957 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
950 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 958 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
951 step1[18] = dct_const_round_shift(temp1); 959 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
952 step1[29] = dct_const_round_shift(temp2); 960 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
953 961
954 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 962 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
955 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 963 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
956 step1[19] = dct_const_round_shift(temp1); 964 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
957 step1[28] = dct_const_round_shift(temp2); 965 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
958 966
959 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 967 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
960 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 968 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
961 step1[20] = dct_const_round_shift(temp1); 969 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
962 step1[27] = dct_const_round_shift(temp2); 970 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
963 971
964 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 972 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
965 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 973 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
966 step1[21] = dct_const_round_shift(temp1); 974 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
967 step1[26] = dct_const_round_shift(temp2); 975 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
968 976
969 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 977 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
970 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 978 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
971 step1[22] = dct_const_round_shift(temp1); 979 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
972 step1[25] = dct_const_round_shift(temp2); 980 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
973 981
974 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 982 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
975 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 983 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
976 step1[23] = dct_const_round_shift(temp1); 984 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
977 step1[24] = dct_const_round_shift(temp2); 985 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
978 986
979 // stage 2 987 // stage 2
980 step2[0] = step1[0]; 988 step2[0] = step1[0];
981 step2[1] = step1[1]; 989 step2[1] = step1[1];
982 step2[2] = step1[2]; 990 step2[2] = step1[2];
983 step2[3] = step1[3]; 991 step2[3] = step1[3];
984 step2[4] = step1[4]; 992 step2[4] = step1[4];
985 step2[5] = step1[5]; 993 step2[5] = step1[5];
986 step2[6] = step1[6]; 994 step2[6] = step1[6];
987 step2[7] = step1[7]; 995 step2[7] = step1[7];
988 996
989 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 997 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
990 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 998 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
991 step2[8] = dct_const_round_shift(temp1); 999 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
992 step2[15] = dct_const_round_shift(temp2); 1000 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
993 1001
994 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1002 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
995 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1003 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
996 step2[9] = dct_const_round_shift(temp1); 1004 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
997 step2[14] = dct_const_round_shift(temp2); 1005 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
998 1006
999 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1007 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1000 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1008 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1001 step2[10] = dct_const_round_shift(temp1); 1009 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1002 step2[13] = dct_const_round_shift(temp2); 1010 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1003 1011
1004 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1012 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1005 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1013 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1006 step2[11] = dct_const_round_shift(temp1); 1014 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1007 step2[12] = dct_const_round_shift(temp2); 1015 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1008 1016
1009 step2[16] = step1[16] + step1[17]; 1017 step2[16] = WRAPLOW(step1[16] + step1[17], 8);
1010 step2[17] = step1[16] - step1[17]; 1018 step2[17] = WRAPLOW(step1[16] - step1[17], 8);
1011 step2[18] = -step1[18] + step1[19]; 1019 step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
1012 step2[19] = step1[18] + step1[19]; 1020 step2[19] = WRAPLOW(step1[18] + step1[19], 8);
1013 step2[20] = step1[20] + step1[21]; 1021 step2[20] = WRAPLOW(step1[20] + step1[21], 8);
1014 step2[21] = step1[20] - step1[21]; 1022 step2[21] = WRAPLOW(step1[20] - step1[21], 8);
1015 step2[22] = -step1[22] + step1[23]; 1023 step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
1016 step2[23] = step1[22] + step1[23]; 1024 step2[23] = WRAPLOW(step1[22] + step1[23], 8);
1017 step2[24] = step1[24] + step1[25]; 1025 step2[24] = WRAPLOW(step1[24] + step1[25], 8);
1018 step2[25] = step1[24] - step1[25]; 1026 step2[25] = WRAPLOW(step1[24] - step1[25], 8);
1019 step2[26] = -step1[26] + step1[27]; 1027 step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
1020 step2[27] = step1[26] + step1[27]; 1028 step2[27] = WRAPLOW(step1[26] + step1[27], 8);
1021 step2[28] = step1[28] + step1[29]; 1029 step2[28] = WRAPLOW(step1[28] + step1[29], 8);
1022 step2[29] = step1[28] - step1[29]; 1030 step2[29] = WRAPLOW(step1[28] - step1[29], 8);
1023 step2[30] = -step1[30] + step1[31]; 1031 step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
1024 step2[31] = step1[30] + step1[31]; 1032 step2[31] = WRAPLOW(step1[30] + step1[31], 8);
1025 1033
1026 // stage 3 1034 // stage 3
1027 step1[0] = step2[0]; 1035 step1[0] = step2[0];
1028 step1[1] = step2[1]; 1036 step1[1] = step2[1];
1029 step1[2] = step2[2]; 1037 step1[2] = step2[2];
1030 step1[3] = step2[3]; 1038 step1[3] = step2[3];
1031 1039
1032 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1040 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1033 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1041 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1034 step1[4] = dct_const_round_shift(temp1); 1042 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
1035 step1[7] = dct_const_round_shift(temp2); 1043 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
1036 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1044 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1037 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1045 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1038 step1[5] = dct_const_round_shift(temp1); 1046 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
1039 step1[6] = dct_const_round_shift(temp2); 1047 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
1040 1048
1041 step1[8] = step2[8] + step2[9]; 1049 step1[8] = WRAPLOW(step2[8] + step2[9], 8);
1042 step1[9] = step2[8] - step2[9]; 1050 step1[9] = WRAPLOW(step2[8] - step2[9], 8);
1043 step1[10] = -step2[10] + step2[11]; 1051 step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
1044 step1[11] = step2[10] + step2[11]; 1052 step1[11] = WRAPLOW(step2[10] + step2[11], 8);
1045 step1[12] = step2[12] + step2[13]; 1053 step1[12] = WRAPLOW(step2[12] + step2[13], 8);
1046 step1[13] = step2[12] - step2[13]; 1054 step1[13] = WRAPLOW(step2[12] - step2[13], 8);
1047 step1[14] = -step2[14] + step2[15]; 1055 step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
1048 step1[15] = step2[14] + step2[15]; 1056 step1[15] = WRAPLOW(step2[14] + step2[15], 8);
1049 1057
1050 step1[16] = step2[16]; 1058 step1[16] = step2[16];
1051 step1[31] = step2[31]; 1059 step1[31] = step2[31];
1052 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 1060 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1053 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 1061 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1054 step1[17] = dct_const_round_shift(temp1); 1062 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
1055 step1[30] = dct_const_round_shift(temp2); 1063 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
1056 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 1064 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1057 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 1065 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1058 step1[18] = dct_const_round_shift(temp1); 1066 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1059 step1[29] = dct_const_round_shift(temp2); 1067 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1060 step1[19] = step2[19]; 1068 step1[19] = step2[19];
1061 step1[20] = step2[20]; 1069 step1[20] = step2[20];
1062 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 1070 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1063 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 1071 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1064 step1[21] = dct_const_round_shift(temp1); 1072 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1065 step1[26] = dct_const_round_shift(temp2); 1073 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1066 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 1074 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1067 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 1075 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1068 step1[22] = dct_const_round_shift(temp1); 1076 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1069 step1[25] = dct_const_round_shift(temp2); 1077 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1070 step1[23] = step2[23]; 1078 step1[23] = step2[23];
1071 step1[24] = step2[24]; 1079 step1[24] = step2[24];
1072 step1[27] = step2[27]; 1080 step1[27] = step2[27];
1073 step1[28] = step2[28]; 1081 step1[28] = step2[28];
1074 1082
1075 // stage 4 1083 // stage 4
1076 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1084 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1077 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1085 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1078 step2[0] = dct_const_round_shift(temp1); 1086 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
1079 step2[1] = dct_const_round_shift(temp2); 1087 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
1080 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 1088 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1081 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1089 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1082 step2[2] = dct_const_round_shift(temp1); 1090 step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
1083 step2[3] = dct_const_round_shift(temp2); 1091 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
1084 step2[4] = step1[4] + step1[5]; 1092 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
1085 step2[5] = step1[4] - step1[5]; 1093 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
1086 step2[6] = -step1[6] + step1[7]; 1094 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
1087 step2[7] = step1[6] + step1[7]; 1095 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
1088 1096
1089 step2[8] = step1[8]; 1097 step2[8] = step1[8];
1090 step2[15] = step1[15]; 1098 step2[15] = step1[15];
1091 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1099 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1092 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1100 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1093 step2[9] = dct_const_round_shift(temp1); 1101 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
1094 step2[14] = dct_const_round_shift(temp2); 1102 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
1095 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1103 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1096 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1104 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1097 step2[10] = dct_const_round_shift(temp1); 1105 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1098 step2[13] = dct_const_round_shift(temp2); 1106 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1099 step2[11] = step1[11]; 1107 step2[11] = step1[11];
1100 step2[12] = step1[12]; 1108 step2[12] = step1[12];
1101 1109
1102 step2[16] = step1[16] + step1[19]; 1110 step2[16] = WRAPLOW(step1[16] + step1[19], 8);
1103 step2[17] = step1[17] + step1[18]; 1111 step2[17] = WRAPLOW(step1[17] + step1[18], 8);
1104 step2[18] = step1[17] - step1[18]; 1112 step2[18] = WRAPLOW(step1[17] - step1[18], 8);
1105 step2[19] = step1[16] - step1[19]; 1113 step2[19] = WRAPLOW(step1[16] - step1[19], 8);
1106 step2[20] = -step1[20] + step1[23]; 1114 step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
1107 step2[21] = -step1[21] + step1[22]; 1115 step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
1108 step2[22] = step1[21] + step1[22]; 1116 step2[22] = WRAPLOW(step1[21] + step1[22], 8);
1109 step2[23] = step1[20] + step1[23]; 1117 step2[23] = WRAPLOW(step1[20] + step1[23], 8);
1110 1118
1111 step2[24] = step1[24] + step1[27]; 1119 step2[24] = WRAPLOW(step1[24] + step1[27], 8);
1112 step2[25] = step1[25] + step1[26]; 1120 step2[25] = WRAPLOW(step1[25] + step1[26], 8);
1113 step2[26] = step1[25] - step1[26]; 1121 step2[26] = WRAPLOW(step1[25] - step1[26], 8);
1114 step2[27] = step1[24] - step1[27]; 1122 step2[27] = WRAPLOW(step1[24] - step1[27], 8);
1115 step2[28] = -step1[28] + step1[31]; 1123 step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
1116 step2[29] = -step1[29] + step1[30]; 1124 step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
1117 step2[30] = step1[29] + step1[30]; 1125 step2[30] = WRAPLOW(step1[29] + step1[30], 8);
1118 step2[31] = step1[28] + step1[31]; 1126 step2[31] = WRAPLOW(step1[28] + step1[31], 8);
1119 1127
1120 // stage 5 1128 // stage 5
1121 step1[0] = step2[0] + step2[3]; 1129 step1[0] = WRAPLOW(step2[0] + step2[3], 8);
1122 step1[1] = step2[1] + step2[2]; 1130 step1[1] = WRAPLOW(step2[1] + step2[2], 8);
1123 step1[2] = step2[1] - step2[2]; 1131 step1[2] = WRAPLOW(step2[1] - step2[2], 8);
1124 step1[3] = step2[0] - step2[3]; 1132 step1[3] = WRAPLOW(step2[0] - step2[3], 8);
1125 step1[4] = step2[4]; 1133 step1[4] = step2[4];
1126 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1134 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1127 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1135 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1128 step1[5] = dct_const_round_shift(temp1); 1136 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
1129 step1[6] = dct_const_round_shift(temp2); 1137 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
1130 step1[7] = step2[7]; 1138 step1[7] = step2[7];
1131 1139
1132 step1[8] = step2[8] + step2[11]; 1140 step1[8] = WRAPLOW(step2[8] + step2[11], 8);
1133 step1[9] = step2[9] + step2[10]; 1141 step1[9] = WRAPLOW(step2[9] + step2[10], 8);
1134 step1[10] = step2[9] - step2[10]; 1142 step1[10] = WRAPLOW(step2[9] - step2[10], 8);
1135 step1[11] = step2[8] - step2[11]; 1143 step1[11] = WRAPLOW(step2[8] - step2[11], 8);
1136 step1[12] = -step2[12] + step2[15]; 1144 step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
1137 step1[13] = -step2[13] + step2[14]; 1145 step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
1138 step1[14] = step2[13] + step2[14]; 1146 step1[14] = WRAPLOW(step2[13] + step2[14], 8);
1139 step1[15] = step2[12] + step2[15]; 1147 step1[15] = WRAPLOW(step2[12] + step2[15], 8);
1140 1148
1141 step1[16] = step2[16]; 1149 step1[16] = step2[16];
1142 step1[17] = step2[17]; 1150 step1[17] = step2[17];
1143 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1151 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1144 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1152 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1145 step1[18] = dct_const_round_shift(temp1); 1153 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1146 step1[29] = dct_const_round_shift(temp2); 1154 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1147 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1155 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1148 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1156 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1149 step1[19] = dct_const_round_shift(temp1); 1157 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
1150 step1[28] = dct_const_round_shift(temp2); 1158 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
1151 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1159 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1152 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1160 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1153 step1[20] = dct_const_round_shift(temp1); 1161 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1154 step1[27] = dct_const_round_shift(temp2); 1162 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1155 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1163 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1156 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1164 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1157 step1[21] = dct_const_round_shift(temp1); 1165 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1158 step1[26] = dct_const_round_shift(temp2); 1166 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1159 step1[22] = step2[22]; 1167 step1[22] = step2[22];
1160 step1[23] = step2[23]; 1168 step1[23] = step2[23];
1161 step1[24] = step2[24]; 1169 step1[24] = step2[24];
1162 step1[25] = step2[25]; 1170 step1[25] = step2[25];
1163 step1[30] = step2[30]; 1171 step1[30] = step2[30];
1164 step1[31] = step2[31]; 1172 step1[31] = step2[31];
1165 1173
1166 // stage 6 1174 // stage 6
1167 step2[0] = step1[0] + step1[7]; 1175 step2[0] = WRAPLOW(step1[0] + step1[7], 8);
1168 step2[1] = step1[1] + step1[6]; 1176 step2[1] = WRAPLOW(step1[1] + step1[6], 8);
1169 step2[2] = step1[2] + step1[5]; 1177 step2[2] = WRAPLOW(step1[2] + step1[5], 8);
1170 step2[3] = step1[3] + step1[4]; 1178 step2[3] = WRAPLOW(step1[3] + step1[4], 8);
1171 step2[4] = step1[3] - step1[4]; 1179 step2[4] = WRAPLOW(step1[3] - step1[4], 8);
1172 step2[5] = step1[2] - step1[5]; 1180 step2[5] = WRAPLOW(step1[2] - step1[5], 8);
1173 step2[6] = step1[1] - step1[6]; 1181 step2[6] = WRAPLOW(step1[1] - step1[6], 8);
1174 step2[7] = step1[0] - step1[7]; 1182 step2[7] = WRAPLOW(step1[0] - step1[7], 8);
1175 step2[8] = step1[8]; 1183 step2[8] = step1[8];
1176 step2[9] = step1[9]; 1184 step2[9] = step1[9];
1177 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1185 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1178 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1186 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1179 step2[10] = dct_const_round_shift(temp1); 1187 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1180 step2[13] = dct_const_round_shift(temp2); 1188 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1181 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1189 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1182 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1190 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1183 step2[11] = dct_const_round_shift(temp1); 1191 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1184 step2[12] = dct_const_round_shift(temp2); 1192 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1185 step2[14] = step1[14]; 1193 step2[14] = step1[14];
1186 step2[15] = step1[15]; 1194 step2[15] = step1[15];
1187 1195
1188 step2[16] = step1[16] + step1[23]; 1196 step2[16] = WRAPLOW(step1[16] + step1[23], 8);
1189 step2[17] = step1[17] + step1[22]; 1197 step2[17] = WRAPLOW(step1[17] + step1[22], 8);
1190 step2[18] = step1[18] + step1[21]; 1198 step2[18] = WRAPLOW(step1[18] + step1[21], 8);
1191 step2[19] = step1[19] + step1[20]; 1199 step2[19] = WRAPLOW(step1[19] + step1[20], 8);
1192 step2[20] = step1[19] - step1[20]; 1200 step2[20] = WRAPLOW(step1[19] - step1[20], 8);
1193 step2[21] = step1[18] - step1[21]; 1201 step2[21] = WRAPLOW(step1[18] - step1[21], 8);
1194 step2[22] = step1[17] - step1[22]; 1202 step2[22] = WRAPLOW(step1[17] - step1[22], 8);
1195 step2[23] = step1[16] - step1[23]; 1203 step2[23] = WRAPLOW(step1[16] - step1[23], 8);
1196 1204
1197 step2[24] = -step1[24] + step1[31]; 1205 step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
1198 step2[25] = -step1[25] + step1[30]; 1206 step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
1199 step2[26] = -step1[26] + step1[29]; 1207 step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
1200 step2[27] = -step1[27] + step1[28]; 1208 step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
1201 step2[28] = step1[27] + step1[28]; 1209 step2[28] = WRAPLOW(step1[27] + step1[28], 8);
1202 step2[29] = step1[26] + step1[29]; 1210 step2[29] = WRAPLOW(step1[26] + step1[29], 8);
1203 step2[30] = step1[25] + step1[30]; 1211 step2[30] = WRAPLOW(step1[25] + step1[30], 8);
1204 step2[31] = step1[24] + step1[31]; 1212 step2[31] = WRAPLOW(step1[24] + step1[31], 8);
1205 1213
1206 // stage 7 1214 // stage 7
1207 step1[0] = step2[0] + step2[15]; 1215 step1[0] = WRAPLOW(step2[0] + step2[15], 8);
1208 step1[1] = step2[1] + step2[14]; 1216 step1[1] = WRAPLOW(step2[1] + step2[14], 8);
1209 step1[2] = step2[2] + step2[13]; 1217 step1[2] = WRAPLOW(step2[2] + step2[13], 8);
1210 step1[3] = step2[3] + step2[12]; 1218 step1[3] = WRAPLOW(step2[3] + step2[12], 8);
1211 step1[4] = step2[4] + step2[11]; 1219 step1[4] = WRAPLOW(step2[4] + step2[11], 8);
1212 step1[5] = step2[5] + step2[10]; 1220 step1[5] = WRAPLOW(step2[5] + step2[10], 8);
1213 step1[6] = step2[6] + step2[9]; 1221 step1[6] = WRAPLOW(step2[6] + step2[9], 8);
1214 step1[7] = step2[7] + step2[8]; 1222 step1[7] = WRAPLOW(step2[7] + step2[8], 8);
1215 step1[8] = step2[7] - step2[8]; 1223 step1[8] = WRAPLOW(step2[7] - step2[8], 8);
1216 step1[9] = step2[6] - step2[9]; 1224 step1[9] = WRAPLOW(step2[6] - step2[9], 8);
1217 step1[10] = step2[5] - step2[10]; 1225 step1[10] = WRAPLOW(step2[5] - step2[10], 8);
1218 step1[11] = step2[4] - step2[11]; 1226 step1[11] = WRAPLOW(step2[4] - step2[11], 8);
1219 step1[12] = step2[3] - step2[12]; 1227 step1[12] = WRAPLOW(step2[3] - step2[12], 8);
1220 step1[13] = step2[2] - step2[13]; 1228 step1[13] = WRAPLOW(step2[2] - step2[13], 8);
1221 step1[14] = step2[1] - step2[14]; 1229 step1[14] = WRAPLOW(step2[1] - step2[14], 8);
1222 step1[15] = step2[0] - step2[15]; 1230 step1[15] = WRAPLOW(step2[0] - step2[15], 8);
1223 1231
1224 step1[16] = step2[16]; 1232 step1[16] = step2[16];
1225 step1[17] = step2[17]; 1233 step1[17] = step2[17];
1226 step1[18] = step2[18]; 1234 step1[18] = step2[18];
1227 step1[19] = step2[19]; 1235 step1[19] = step2[19];
1228 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1236 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1229 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1237 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1230 step1[20] = dct_const_round_shift(temp1); 1238 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1231 step1[27] = dct_const_round_shift(temp2); 1239 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1232 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1240 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1233 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1241 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1234 step1[21] = dct_const_round_shift(temp1); 1242 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1235 step1[26] = dct_const_round_shift(temp2); 1243 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1236 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1244 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1237 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1245 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1238 step1[22] = dct_const_round_shift(temp1); 1246 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1239 step1[25] = dct_const_round_shift(temp2); 1247 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1240 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1248 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1241 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1249 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1242 step1[23] = dct_const_round_shift(temp1); 1250 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
1243 step1[24] = dct_const_round_shift(temp2); 1251 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
1244 step1[28] = step2[28]; 1252 step1[28] = step2[28];
1245 step1[29] = step2[29]; 1253 step1[29] = step2[29];
1246 step1[30] = step2[30]; 1254 step1[30] = step2[30];
1247 step1[31] = step2[31]; 1255 step1[31] = step2[31];
1248 1256
1249 // final stage 1257 // final stage
1250 output[0] = step1[0] + step1[31]; 1258 output[0] = WRAPLOW(step1[0] + step1[31], 8);
1251 output[1] = step1[1] + step1[30]; 1259 output[1] = WRAPLOW(step1[1] + step1[30], 8);
1252 output[2] = step1[2] + step1[29]; 1260 output[2] = WRAPLOW(step1[2] + step1[29], 8);
1253 output[3] = step1[3] + step1[28]; 1261 output[3] = WRAPLOW(step1[3] + step1[28], 8);
1254 output[4] = step1[4] + step1[27]; 1262 output[4] = WRAPLOW(step1[4] + step1[27], 8);
1255 output[5] = step1[5] + step1[26]; 1263 output[5] = WRAPLOW(step1[5] + step1[26], 8);
1256 output[6] = step1[6] + step1[25]; 1264 output[6] = WRAPLOW(step1[6] + step1[25], 8);
1257 output[7] = step1[7] + step1[24]; 1265 output[7] = WRAPLOW(step1[7] + step1[24], 8);
1258 output[8] = step1[8] + step1[23]; 1266 output[8] = WRAPLOW(step1[8] + step1[23], 8);
1259 output[9] = step1[9] + step1[22]; 1267 output[9] = WRAPLOW(step1[9] + step1[22], 8);
1260 output[10] = step1[10] + step1[21]; 1268 output[10] = WRAPLOW(step1[10] + step1[21], 8);
1261 output[11] = step1[11] + step1[20]; 1269 output[11] = WRAPLOW(step1[11] + step1[20], 8);
1262 output[12] = step1[12] + step1[19]; 1270 output[12] = WRAPLOW(step1[12] + step1[19], 8);
1263 output[13] = step1[13] + step1[18]; 1271 output[13] = WRAPLOW(step1[13] + step1[18], 8);
1264 output[14] = step1[14] + step1[17]; 1272 output[14] = WRAPLOW(step1[14] + step1[17], 8);
1265 output[15] = step1[15] + step1[16]; 1273 output[15] = WRAPLOW(step1[15] + step1[16], 8);
1266 output[16] = step1[15] - step1[16]; 1274 output[16] = WRAPLOW(step1[15] - step1[16], 8);
1267 output[17] = step1[14] - step1[17]; 1275 output[17] = WRAPLOW(step1[14] - step1[17], 8);
1268 output[18] = step1[13] - step1[18]; 1276 output[18] = WRAPLOW(step1[13] - step1[18], 8);
1269 output[19] = step1[12] - step1[19]; 1277 output[19] = WRAPLOW(step1[12] - step1[19], 8);
1270 output[20] = step1[11] - step1[20]; 1278 output[20] = WRAPLOW(step1[11] - step1[20], 8);
1271 output[21] = step1[10] - step1[21]; 1279 output[21] = WRAPLOW(step1[10] - step1[21], 8);
1272 output[22] = step1[9] - step1[22]; 1280 output[22] = WRAPLOW(step1[9] - step1[22], 8);
1273 output[23] = step1[8] - step1[23]; 1281 output[23] = WRAPLOW(step1[8] - step1[23], 8);
1274 output[24] = step1[7] - step1[24]; 1282 output[24] = WRAPLOW(step1[7] - step1[24], 8);
1275 output[25] = step1[6] - step1[25]; 1283 output[25] = WRAPLOW(step1[6] - step1[25], 8);
1276 output[26] = step1[5] - step1[26]; 1284 output[26] = WRAPLOW(step1[5] - step1[26], 8);
1277 output[27] = step1[4] - step1[27]; 1285 output[27] = WRAPLOW(step1[4] - step1[27], 8);
1278 output[28] = step1[3] - step1[28]; 1286 output[28] = WRAPLOW(step1[3] - step1[28], 8);
1279 output[29] = step1[2] - step1[29]; 1287 output[29] = WRAPLOW(step1[2] - step1[29], 8);
1280 output[30] = step1[1] - step1[30]; 1288 output[30] = WRAPLOW(step1[1] - step1[30], 8);
1281 output[31] = step1[0] - step1[31]; 1289 output[31] = WRAPLOW(step1[0] - step1[31], 8);
1282 } 1290 }
1283 1291
1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, 1292 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1285 int stride) { 1293 int stride) {
1286 tran_low_t out[32 * 32]; 1294 tran_low_t out[32 * 32];
1287 tran_low_t *outptr = out; 1295 tran_low_t *outptr = out;
1288 int i, j; 1296 int i, j;
1289 tran_low_t temp_in[32], temp_out[32]; 1297 tran_low_t temp_in[32], temp_out[32];
1290 1298
1291 // Rows 1299 // Rows
(...skipping 14 matching lines...) Expand all
1306 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); 1314 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
1307 input += 32; 1315 input += 32;
1308 outptr += 32; 1316 outptr += 32;
1309 } 1317 }
1310 1318
1311 // Columns 1319 // Columns
1312 for (i = 0; i < 32; ++i) { 1320 for (i = 0; i < 32; ++i) {
1313 for (j = 0; j < 32; ++j) 1321 for (j = 0; j < 32; ++j)
1314 temp_in[j] = out[j * 32 + i]; 1322 temp_in[j] = out[j * 32 + i];
1315 idct32(temp_in, temp_out); 1323 idct32(temp_in, temp_out);
1316 for (j = 0; j < 32; ++j) 1324 for (j = 0; j < 32; ++j) {
1317 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1325 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1318 + dest[j * stride + i]); 1326 ROUND_POWER_OF_TWO(temp_out[j], 6));
1327 }
1319 } 1328 }
1320 } 1329 }
1321 1330
1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, 1331 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1323 int stride) { 1332 int stride) {
1324 tran_low_t out[32 * 32] = {0}; 1333 tran_low_t out[32 * 32] = {0};
1325 tran_low_t *outptr = out; 1334 tran_low_t *outptr = out;
1326 int i, j; 1335 int i, j;
1327 tran_low_t temp_in[32], temp_out[32]; 1336 tran_low_t temp_in[32], temp_out[32];
1328 1337
1329 // Rows 1338 // Rows
1330 // only upper-left 8x8 has non-zero coeff 1339 // only upper-left 8x8 has non-zero coeff
1331 for (i = 0; i < 8; ++i) { 1340 for (i = 0; i < 8; ++i) {
1332 idct32(input, outptr); 1341 idct32(input, outptr);
1333 input += 32; 1342 input += 32;
1334 outptr += 32; 1343 outptr += 32;
1335 } 1344 }
1336 1345
1337 // Columns 1346 // Columns
1338 for (i = 0; i < 32; ++i) { 1347 for (i = 0; i < 32; ++i) {
1339 for (j = 0; j < 32; ++j) 1348 for (j = 0; j < 32; ++j)
1340 temp_in[j] = out[j * 32 + i]; 1349 temp_in[j] = out[j * 32 + i];
1341 idct32(temp_in, temp_out); 1350 idct32(temp_in, temp_out);
1342 for (j = 0; j < 32; ++j) 1351 for (j = 0; j < 32; ++j) {
1343 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1352 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1344 + dest[j * stride + i]); 1353 ROUND_POWER_OF_TWO(temp_out[j], 6));
1354 }
1345 } 1355 }
1346 } 1356 }
1347 1357
1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 1358 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1349 int i, j; 1359 int i, j;
1350 tran_high_t a1; 1360 tran_high_t a1;
1351 1361
1352 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 1362 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
1353 out = dct_const_round_shift(out * cospi_16_64); 1363 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
1354 a1 = ROUND_POWER_OF_TWO(out, 6); 1364 a1 = ROUND_POWER_OF_TWO(out, 6);
1355 1365
1356 for (j = 0; j < 32; ++j) { 1366 for (j = 0; j < 32; ++j) {
1357 for (i = 0; i < 32; ++i) 1367 for (i = 0; i < 32; ++i)
1358 dest[i] = clip_pixel(dest[i] + a1); 1368 dest[i] = clip_pixel_add(dest[i], a1);
1359 dest += stride; 1369 dest += stride;
1360 } 1370 }
1361 } 1371 }
1362 1372
1363 // idct 1373 // idct
1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 1374 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
1365 int eob) { 1375 int eob) {
1366 if (eob > 1) 1376 if (eob > 1)
1367 vp9_idct4x4_16_add(input, dest, stride); 1377 vp9_idct4x4_16_add(input, dest, stride);
1368 else 1378 else
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, 1451 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1442 int stride, int eob) { 1452 int stride, int eob) {
1443 if (tx_type == DCT_DCT) { 1453 if (tx_type == DCT_DCT) {
1444 vp9_idct16x16_add(input, dest, stride, eob); 1454 vp9_idct16x16_add(input, dest, stride, eob);
1445 } else { 1455 } else {
1446 vp9_iht16x16_256_add(input, dest, stride, tx_type); 1456 vp9_iht16x16_256_add(input, dest, stride, tx_type);
1447 } 1457 }
1448 } 1458 }
1449 1459
1450 #if CONFIG_VP9_HIGHBITDEPTH 1460 #if CONFIG_VP9_HIGHBITDEPTH
1451 void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1461 void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1452 int stride, int bd) { 1462 int stride, int bd) {
1453 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1463 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1454 0.5 shifts per pixel. */ 1464 0.5 shifts per pixel. */
1455 int i; 1465 int i;
1456 tran_low_t output[16]; 1466 tran_low_t output[16];
1457 tran_high_t a1, b1, c1, d1, e1; 1467 tran_high_t a1, b1, c1, d1, e1;
1458 const tran_low_t *ip = input; 1468 const tran_low_t *ip = input;
1459 tran_low_t *op = output; 1469 tran_low_t *op = output;
1460 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1470 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1461 1471
1462 for (i = 0; i < 4; i++) { 1472 for (i = 0; i < 4; i++) {
1463 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1473 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1464 c1 = ip[1] >> UNIT_QUANT_SHIFT; 1474 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1465 d1 = ip[2] >> UNIT_QUANT_SHIFT; 1475 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1466 b1 = ip[3] >> UNIT_QUANT_SHIFT; 1476 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1467 a1 += c1; 1477 a1 += c1;
1468 d1 -= b1; 1478 d1 -= b1;
1469 e1 = (a1 - d1) >> 1; 1479 e1 = (a1 - d1) >> 1;
1470 b1 = e1 - b1; 1480 b1 = e1 - b1;
1471 c1 = e1 - c1; 1481 c1 = e1 - c1;
1472 a1 -= b1; 1482 a1 -= b1;
1473 d1 += c1; 1483 d1 += c1;
1474 op[0] = WRAPLOW(a1); 1484 op[0] = WRAPLOW(a1, bd);
1475 op[1] = WRAPLOW(b1); 1485 op[1] = WRAPLOW(b1, bd);
1476 op[2] = WRAPLOW(c1); 1486 op[2] = WRAPLOW(c1, bd);
1477 op[3] = WRAPLOW(d1); 1487 op[3] = WRAPLOW(d1, bd);
1478 ip += 4; 1488 ip += 4;
1479 op += 4; 1489 op += 4;
1480 } 1490 }
1481 1491
1482 ip = output; 1492 ip = output;
1483 for (i = 0; i < 4; i++) { 1493 for (i = 0; i < 4; i++) {
1484 a1 = ip[4 * 0]; 1494 a1 = ip[4 * 0];
1485 c1 = ip[4 * 1]; 1495 c1 = ip[4 * 1];
1486 d1 = ip[4 * 2]; 1496 d1 = ip[4 * 2];
1487 b1 = ip[4 * 3]; 1497 b1 = ip[4 * 3];
1488 a1 += c1; 1498 a1 += c1;
1489 d1 -= b1; 1499 d1 -= b1;
1490 e1 = (a1 - d1) >> 1; 1500 e1 = (a1 - d1) >> 1;
1491 b1 = e1 - b1; 1501 b1 = e1 - b1;
1492 c1 = e1 - c1; 1502 c1 = e1 - c1;
1493 a1 -= b1; 1503 a1 -= b1;
1494 d1 += c1; 1504 d1 += c1;
1495 dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd); 1505 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1496 dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd); 1506 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
1497 dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd); 1507 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
1498 dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd); 1508 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
1499 1509
1500 ip++; 1510 ip++;
1501 dest++; 1511 dest++;
1502 } 1512 }
1503 } 1513 }
1504 1514
1505 static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) { 1515 void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1506 tran_low_t step[4]; 1516 int dest_stride, int bd) {
1507 tran_high_t temp1, temp2;
1508 (void) bd;
1509 // stage 1
1510 temp1 = (input[0] + input[2]) * cospi_16_64;
1511 temp2 = (input[0] - input[2]) * cospi_16_64;
1512 step[0] = WRAPLOW(dct_const_round_shift(temp1));
1513 step[1] = WRAPLOW(dct_const_round_shift(temp2));
1514 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1515 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1516 step[2] = WRAPLOW(dct_const_round_shift(temp1));
1517 step[3] = WRAPLOW(dct_const_round_shift(temp2));
1518
1519 // stage 2
1520 output[0] = WRAPLOW(step[0] + step[3]);
1521 output[1] = WRAPLOW(step[1] + step[2]);
1522 output[2] = WRAPLOW(step[1] - step[2]);
1523 output[3] = WRAPLOW(step[0] - step[3]);
1524 }
1525
1526 void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1527 int dest_stride, int bd) {
1528 int i; 1517 int i;
1529 tran_high_t a1, e1; 1518 tran_high_t a1, e1;
1530 tran_low_t tmp[4]; 1519 tran_low_t tmp[4];
1531 const tran_low_t *ip = in; 1520 const tran_low_t *ip = in;
1532 tran_low_t *op = tmp; 1521 tran_low_t *op = tmp;
1533 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1522 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1534 (void) bd; 1523 (void) bd;
1535 1524
1536 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1525 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1537 e1 = a1 >> 1; 1526 e1 = a1 >> 1;
1538 a1 -= e1; 1527 a1 -= e1;
1539 op[0] = WRAPLOW(a1); 1528 op[0] = WRAPLOW(a1, bd);
1540 op[1] = op[2] = op[3] = WRAPLOW(e1); 1529 op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
1541 1530
1542 ip = tmp; 1531 ip = tmp;
1543 for (i = 0; i < 4; i++) { 1532 for (i = 0; i < 4; i++) {
1544 e1 = ip[0] >> 1; 1533 e1 = ip[0] >> 1;
1545 a1 = ip[0] - e1; 1534 a1 = ip[0] - e1;
1546 dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd); 1535 dest[dest_stride * 0] = highbd_clip_pixel_add(
1547 dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd); 1536 dest[dest_stride * 0], a1, bd);
1548 dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd); 1537 dest[dest_stride * 1] = highbd_clip_pixel_add(
1549 dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd); 1538 dest[dest_stride * 1], e1, bd);
1539 dest[dest_stride * 2] = highbd_clip_pixel_add(
1540 dest[dest_stride * 2], e1, bd);
1541 dest[dest_stride * 3] = highbd_clip_pixel_add(
1542 dest[dest_stride * 3], e1, bd);
1550 ip++; 1543 ip++;
1551 dest++; 1544 dest++;
1552 } 1545 }
1553 } 1546 }
1554 1547
1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1548 static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
1556 int stride, int bd) { 1549 tran_low_t step[4];
1550 tran_high_t temp1, temp2;
1551 (void) bd;
1552 // stage 1
1553 temp1 = (input[0] + input[2]) * cospi_16_64;
1554 temp2 = (input[0] - input[2]) * cospi_16_64;
1555 step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
1556 step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
1557 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1558 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1559 step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
1560 step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
1561
1562 // stage 2
1563 output[0] = WRAPLOW(step[0] + step[3], bd);
1564 output[1] = WRAPLOW(step[1] + step[2], bd);
1565 output[2] = WRAPLOW(step[1] - step[2], bd);
1566 output[3] = WRAPLOW(step[0] - step[3], bd);
1567 }
1568
1569 void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1570 int stride, int bd) {
1557 tran_low_t out[4 * 4]; 1571 tran_low_t out[4 * 4];
1558 tran_low_t *outptr = out; 1572 tran_low_t *outptr = out;
1559 int i, j; 1573 int i, j;
1560 tran_low_t temp_in[4], temp_out[4]; 1574 tran_low_t temp_in[4], temp_out[4];
1561 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1575 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1562 1576
1563 // Rows 1577 // Rows
1564 for (i = 0; i < 4; ++i) { 1578 for (i = 0; i < 4; ++i) {
1565 high_idct4(input, outptr, bd); 1579 highbd_idct4(input, outptr, bd);
1566 input += 4; 1580 input += 4;
1567 outptr += 4; 1581 outptr += 4;
1568 } 1582 }
1569 1583
1570 // Columns 1584 // Columns
1571 for (i = 0; i < 4; ++i) { 1585 for (i = 0; i < 4; ++i) {
1572 for (j = 0; j < 4; ++j) 1586 for (j = 0; j < 4; ++j)
1573 temp_in[j] = out[j * 4 + i]; 1587 temp_in[j] = out[j * 4 + i];
1574 high_idct4(temp_in, temp_out, bd); 1588 highbd_idct4(temp_in, temp_out, bd);
1575 for (j = 0; j < 4; ++j) 1589 for (j = 0; j < 4; ++j) {
1576 dest[j * stride + i] = clip_pixel_bd_high( 1590 dest[j * stride + i] = highbd_clip_pixel_add(
1577 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1591 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1592 }
1578 } 1593 }
1579 } 1594 }
1580 1595
1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, 1596 void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1582 int dest_stride, int bd) { 1597 int dest_stride, int bd) {
1583 int i; 1598 int i;
1584 tran_high_t a1; 1599 tran_high_t a1;
1585 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1600 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1586 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1601 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1587 1602
1588 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1603 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1589 a1 = ROUND_POWER_OF_TWO(out, 4); 1604 a1 = ROUND_POWER_OF_TWO(out, 4);
1590 1605
1591 for (i = 0; i < 4; i++) { 1606 for (i = 0; i < 4; i++) {
1592 dest[0] = clip_pixel_bd_high(dest[0], a1, bd); 1607 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1593 dest[1] = clip_pixel_bd_high(dest[1], a1, bd); 1608 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1594 dest[2] = clip_pixel_bd_high(dest[2], a1, bd); 1609 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1595 dest[3] = clip_pixel_bd_high(dest[3], a1, bd); 1610 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1596 dest += dest_stride; 1611 dest += dest_stride;
1597 } 1612 }
1598 } 1613 }
1599 1614
1600 static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) { 1615 static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
1601 tran_low_t step1[8], step2[8]; 1616 tran_low_t step1[8], step2[8];
1602 tran_high_t temp1, temp2; 1617 tran_high_t temp1, temp2;
1603 // stage 1 1618 // stage 1
1604 step1[0] = input[0]; 1619 step1[0] = input[0];
1605 step1[2] = input[4]; 1620 step1[2] = input[4];
1606 step1[1] = input[2]; 1621 step1[1] = input[2];
1607 step1[3] = input[6]; 1622 step1[3] = input[6];
1608 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 1623 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1609 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 1624 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1610 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 1625 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
1611 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 1626 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
1612 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 1627 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1613 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 1628 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1614 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1629 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1615 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1630 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1616 1631
1617 // stage 2 & stage 3 - even half 1632 // stage 2 & stage 3 - even half
1618 high_idct4(step1, step1, bd); 1633 highbd_idct4(step1, step1, bd);
1619 1634
1620 // stage 2 - odd half 1635 // stage 2 - odd half
1621 step2[4] = WRAPLOW(step1[4] + step1[5]); 1636 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1622 step2[5] = WRAPLOW(step1[4] - step1[5]); 1637 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1623 step2[6] = WRAPLOW(-step1[6] + step1[7]); 1638 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1624 step2[7] = WRAPLOW(step1[6] + step1[7]); 1639 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1625 1640
1626 // stage 3 - odd half 1641 // stage 3 - odd half
1627 step1[4] = step2[4]; 1642 step1[4] = step2[4];
1628 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1643 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1629 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1644 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1630 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1645 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1631 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1646 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1632 step1[7] = step2[7]; 1647 step1[7] = step2[7];
1633 1648
1634 // stage 4 1649 // stage 4
1635 output[0] = WRAPLOW(step1[0] + step1[7]); 1650 output[0] = WRAPLOW(step1[0] + step1[7], bd);
1636 output[1] = WRAPLOW(step1[1] + step1[6]); 1651 output[1] = WRAPLOW(step1[1] + step1[6], bd);
1637 output[2] = WRAPLOW(step1[2] + step1[5]); 1652 output[2] = WRAPLOW(step1[2] + step1[5], bd);
1638 output[3] = WRAPLOW(step1[3] + step1[4]); 1653 output[3] = WRAPLOW(step1[3] + step1[4], bd);
1639 output[4] = WRAPLOW(step1[3] - step1[4]); 1654 output[4] = WRAPLOW(step1[3] - step1[4], bd);
1640 output[5] = WRAPLOW(step1[2] - step1[5]); 1655 output[5] = WRAPLOW(step1[2] - step1[5], bd);
1641 output[6] = WRAPLOW(step1[1] - step1[6]); 1656 output[6] = WRAPLOW(step1[1] - step1[6], bd);
1642 output[7] = WRAPLOW(step1[0] - step1[7]); 1657 output[7] = WRAPLOW(step1[0] - step1[7], bd);
1643 } 1658 }
1644 1659
1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1660 void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1646 int stride, int bd) { 1661 int stride, int bd) {
1647 tran_low_t out[8 * 8]; 1662 tran_low_t out[8 * 8];
1648 tran_low_t *outptr = out; 1663 tran_low_t *outptr = out;
1649 int i, j; 1664 int i, j;
1650 tran_low_t temp_in[8], temp_out[8]; 1665 tran_low_t temp_in[8], temp_out[8];
1651 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1666 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1652 1667
1653 // First transform rows. 1668 // First transform rows.
1654 for (i = 0; i < 8; ++i) { 1669 for (i = 0; i < 8; ++i) {
1655 high_idct8(input, outptr, bd); 1670 highbd_idct8(input, outptr, bd);
1656 input += 8; 1671 input += 8;
1657 outptr += 8; 1672 outptr += 8;
1658 } 1673 }
1659 1674
1660 // Then transform columns. 1675 // Then transform columns.
1661 for (i = 0; i < 8; ++i) { 1676 for (i = 0; i < 8; ++i) {
1662 for (j = 0; j < 8; ++j) 1677 for (j = 0; j < 8; ++j)
1663 temp_in[j] = out[j * 8 + i]; 1678 temp_in[j] = out[j * 8 + i];
1664 high_idct8(temp_in, temp_out, bd); 1679 highbd_idct8(temp_in, temp_out, bd);
1665 for (j = 0; j < 8; ++j) 1680 for (j = 0; j < 8; ++j) {
1666 dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], 1681 dest[j * stride + i] = highbd_clip_pixel_add(
1667 ROUND_POWER_OF_TWO(temp_out[j], 5), 1682 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1668 bd); 1683 }
1669 } 1684 }
1670 } 1685 }
1671 1686
1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, 1687 void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1673 int stride, int bd) { 1688 int stride, int bd) {
1674 int i, j; 1689 int i, j;
1675 tran_high_t a1; 1690 tran_high_t a1;
1676 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1691 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1677 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1692 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1678 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1693 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1679 a1 = ROUND_POWER_OF_TWO(out, 5); 1694 a1 = ROUND_POWER_OF_TWO(out, 5);
1680 for (j = 0; j < 8; ++j) { 1695 for (j = 0; j < 8; ++j) {
1681 for (i = 0; i < 8; ++i) 1696 for (i = 0; i < 8; ++i)
1682 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 1697 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1683 dest += stride; 1698 dest += stride;
1684 } 1699 }
1685 } 1700 }
1686 1701
1687 static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { 1702 static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
1688 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1703 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1689 1704
1690 tran_high_t x0 = input[0]; 1705 tran_high_t x0 = input[0];
1691 tran_high_t x1 = input[1]; 1706 tran_high_t x1 = input[1];
1692 tran_high_t x2 = input[2]; 1707 tran_high_t x2 = input[2];
1693 tran_high_t x3 = input[3]; 1708 tran_high_t x3 = input[3];
1694 (void) bd; 1709 (void) bd;
1695 1710
1696 if (!(x0 | x1 | x2 | x3)) { 1711 if (!(x0 | x1 | x2 | x3)) {
1697 vpx_memset(output, 0, 4 * sizeof(*output)); 1712 vpx_memset(output, 0, 4 * sizeof(*output));
(...skipping 16 matching lines...) Expand all
1714 1729
1715 s0 = x0 + x3; 1730 s0 = x0 + x3;
1716 s1 = x1 + x3; 1731 s1 = x1 + x3;
1717 s2 = x2; 1732 s2 = x2;
1718 s3 = x0 + x1 - x3; 1733 s3 = x0 + x1 - x3;
1719 1734
1720 // 1-D transform scaling factor is sqrt(2). 1735 // 1-D transform scaling factor is sqrt(2).
1721 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 1736 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1722 // + 1b (addition) = 29b. 1737 // + 1b (addition) = 29b.
1723 // Hence the output bit depth is 15b. 1738 // Hence the output bit depth is 15b.
1724 output[0] = WRAPLOW(dct_const_round_shift(s0)); 1739 output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
1725 output[1] = WRAPLOW(dct_const_round_shift(s1)); 1740 output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
1726 output[2] = WRAPLOW(dct_const_round_shift(s2)); 1741 output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
1727 output[3] = WRAPLOW(dct_const_round_shift(s3)); 1742 output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
1728 } 1743 }
1729 1744
1730 void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1745 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1731 int stride, int tx_type, int bd) { 1746 int stride, int tx_type, int bd) {
1732 const high_transform_2d IHT_4[] = { 1747 const highbd_transform_2d IHT_4[] = {
1733 { high_idct4, high_idct4 }, // DCT_DCT = 0 1748 { highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
1734 { high_iadst4, high_idct4 }, // ADST_DCT = 1 1749 { highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
1735 { high_idct4, high_iadst4 }, // DCT_ADST = 2 1750 { highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
1736 { high_iadst4, high_iadst4 } // ADST_ADST = 3 1751 { highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
1737 }; 1752 };
1738 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1753 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1739 1754
1740 int i, j; 1755 int i, j;
1741 tran_low_t out[4 * 4]; 1756 tran_low_t out[4 * 4];
1742 tran_low_t *outptr = out; 1757 tran_low_t *outptr = out;
1743 tran_low_t temp_in[4], temp_out[4]; 1758 tran_low_t temp_in[4], temp_out[4];
1744 1759
1745 // Inverse transform row vectors. 1760 // Inverse transform row vectors.
1746 for (i = 0; i < 4; ++i) { 1761 for (i = 0; i < 4; ++i) {
1747 IHT_4[tx_type].rows(input, outptr, bd); 1762 IHT_4[tx_type].rows(input, outptr, bd);
1748 input += 4; 1763 input += 4;
1749 outptr += 4; 1764 outptr += 4;
1750 } 1765 }
1751 1766
1752 // Inverse transform column vectors. 1767 // Inverse transform column vectors.
1753 for (i = 0; i < 4; ++i) { 1768 for (i = 0; i < 4; ++i) {
1754 for (j = 0; j < 4; ++j) 1769 for (j = 0; j < 4; ++j)
1755 temp_in[j] = out[j * 4 + i]; 1770 temp_in[j] = out[j * 4 + i];
1756 IHT_4[tx_type].cols(temp_in, temp_out, bd); 1771 IHT_4[tx_type].cols(temp_in, temp_out, bd);
1757 for (j = 0; j < 4; ++j) 1772 for (j = 0; j < 4; ++j) {
1758 dest[j * stride + i] = clip_pixel_bd_high( 1773 dest[j * stride + i] = highbd_clip_pixel_add(
1759 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1774 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1775 }
1760 } 1776 }
1761 } 1777 }
1762 1778
1763 static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { 1779 static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
1764 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1780 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1765 1781
1766 tran_high_t x0 = input[7]; 1782 tran_high_t x0 = input[7];
1767 tran_high_t x1 = input[0]; 1783 tran_high_t x1 = input[0];
1768 tran_high_t x2 = input[5]; 1784 tran_high_t x2 = input[5];
1769 tran_high_t x3 = input[2]; 1785 tran_high_t x3 = input[2];
1770 tran_high_t x4 = input[3]; 1786 tran_high_t x4 = input[3];
1771 tran_high_t x5 = input[4]; 1787 tran_high_t x5 = input[4];
1772 tran_high_t x6 = input[1]; 1788 tran_high_t x6 = input[1];
1773 tran_high_t x7 = input[6]; 1789 tran_high_t x7 = input[6];
1774 (void) bd; 1790 (void) bd;
1775 1791
1776 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 1792 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1777 vpx_memset(output, 0, 8 * sizeof(*output)); 1793 vpx_memset(output, 0, 8 * sizeof(*output));
1778 return; 1794 return;
1779 } 1795 }
1780 1796
1781 // stage 1 1797 // stage 1
1782 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 1798 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1783 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 1799 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1784 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 1800 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1785 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 1801 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1786 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 1802 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1787 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 1803 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1788 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 1804 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1789 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 1805 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1790 1806
1791 x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); 1807 x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1792 x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); 1808 x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1793 x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); 1809 x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1794 x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); 1810 x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1795 x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); 1811 x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1796 x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); 1812 x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1797 x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); 1813 x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1798 x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); 1814 x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1799 1815
1800 // stage 2 1816 // stage 2
1801 s0 = x0; 1817 s0 = x0;
1802 s1 = x1; 1818 s1 = x1;
1803 s2 = x2; 1819 s2 = x2;
1804 s3 = x3; 1820 s3 = x3;
1805 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 1821 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1806 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 1822 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1807 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 1823 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1808 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 1824 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1809 1825
1810 x0 = s0 + s2; 1826 x0 = WRAPLOW(s0 + s2, bd);
1811 x1 = s1 + s3; 1827 x1 = WRAPLOW(s1 + s3, bd);
1812 x2 = s0 - s2; 1828 x2 = WRAPLOW(s0 - s2, bd);
1813 x3 = s1 - s3; 1829 x3 = WRAPLOW(s1 - s3, bd);
1814 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 1830 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1815 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 1831 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1816 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 1832 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1817 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 1833 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1818 1834
1819 // stage 3 1835 // stage 3
1820 s2 = cospi_16_64 * (x2 + x3); 1836 s2 = cospi_16_64 * (x2 + x3);
1821 s3 = cospi_16_64 * (x2 - x3); 1837 s3 = cospi_16_64 * (x2 - x3);
1822 s6 = cospi_16_64 * (x6 + x7); 1838 s6 = cospi_16_64 * (x6 + x7);
1823 s7 = cospi_16_64 * (x6 - x7); 1839 s7 = cospi_16_64 * (x6 - x7);
1824 1840
1825 x2 = WRAPLOW(dct_const_round_shift(s2)); 1841 x2 = WRAPLOW(dct_const_round_shift(s2), bd);
1826 x3 = WRAPLOW(dct_const_round_shift(s3)); 1842 x3 = WRAPLOW(dct_const_round_shift(s3), bd);
1827 x6 = WRAPLOW(dct_const_round_shift(s6)); 1843 x6 = WRAPLOW(dct_const_round_shift(s6), bd);
1828 x7 = WRAPLOW(dct_const_round_shift(s7)); 1844 x7 = WRAPLOW(dct_const_round_shift(s7), bd);
1829 1845
1830 output[0] = WRAPLOW(x0); 1846 output[0] = WRAPLOW(x0, bd);
1831 output[1] = WRAPLOW(-x4); 1847 output[1] = WRAPLOW(-x4, bd);
1832 output[2] = WRAPLOW(x6); 1848 output[2] = WRAPLOW(x6, bd);
1833 output[3] = WRAPLOW(-x2); 1849 output[3] = WRAPLOW(-x2, bd);
1834 output[4] = WRAPLOW(x3); 1850 output[4] = WRAPLOW(x3, bd);
1835 output[5] = WRAPLOW(-x7); 1851 output[5] = WRAPLOW(-x7, bd);
1836 output[6] = WRAPLOW(x5); 1852 output[6] = WRAPLOW(x5, bd);
1837 output[7] = WRAPLOW(-x1); 1853 output[7] = WRAPLOW(-x1, bd);
1838 } 1854 }
1839 1855
1840 static const high_transform_2d HIGH_IHT_8[] = { 1856 static const highbd_transform_2d HIGH_IHT_8[] = {
1841 { high_idct8, high_idct8 }, // DCT_DCT = 0 1857 { highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
1842 { high_iadst8, high_idct8 }, // ADST_DCT = 1 1858 { highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
1843 { high_idct8, high_iadst8 }, // DCT_ADST = 2 1859 { highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
1844 { high_iadst8, high_iadst8 } // ADST_ADST = 3 1860 { highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
1845 }; 1861 };
1846 1862
1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1863 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1848 int stride, int tx_type, int bd) { 1864 int stride, int tx_type, int bd) {
1849 int i, j; 1865 int i, j;
1850 tran_low_t out[8 * 8]; 1866 tran_low_t out[8 * 8];
1851 tran_low_t *outptr = out; 1867 tran_low_t *outptr = out;
1852 tran_low_t temp_in[8], temp_out[8]; 1868 tran_low_t temp_in[8], temp_out[8];
1853 const high_transform_2d ht = HIGH_IHT_8[tx_type]; 1869 const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
1854 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1870 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1855 1871
1856 // Inverse transform row vectors. 1872 // Inverse transform row vectors.
1857 for (i = 0; i < 8; ++i) { 1873 for (i = 0; i < 8; ++i) {
1858 ht.rows(input, outptr, bd); 1874 ht.rows(input, outptr, bd);
1859 input += 8; 1875 input += 8;
1860 outptr += 8; 1876 outptr += 8;
1861 } 1877 }
1862 1878
1863 // Inverse transform column vectors. 1879 // Inverse transform column vectors.
1864 for (i = 0; i < 8; ++i) { 1880 for (i = 0; i < 8; ++i) {
1865 for (j = 0; j < 8; ++j) 1881 for (j = 0; j < 8; ++j)
1866 temp_in[j] = out[j * 8 + i]; 1882 temp_in[j] = out[j * 8 + i];
1867 ht.cols(temp_in, temp_out, bd); 1883 ht.cols(temp_in, temp_out, bd);
1868 for (j = 0; j < 8; ++j) 1884 for (j = 0; j < 8; ++j) {
1869 dest[j * stride + i] = clip_pixel_bd_high( 1885 dest[j * stride + i] = highbd_clip_pixel_add(
1870 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1886 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1887 }
1871 } 1888 }
1872 } 1889 }
1873 1890
1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, 1891 void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
1875 int stride, int bd) { 1892 int stride, int bd) {
1876 tran_low_t out[8 * 8] = { 0 }; 1893 tran_low_t out[8 * 8] = { 0 };
1877 tran_low_t *outptr = out; 1894 tran_low_t *outptr = out;
1878 int i, j; 1895 int i, j;
1879 tran_low_t temp_in[8], temp_out[8]; 1896 tran_low_t temp_in[8], temp_out[8];
1880 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1897 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1881 1898
1882 // First transform rows. 1899 // First transform rows.
1883 // Only first 4 row has non-zero coefs. 1900 // Only first 4 row has non-zero coefs.
1884 for (i = 0; i < 4; ++i) { 1901 for (i = 0; i < 4; ++i) {
1885 high_idct8(input, outptr, bd); 1902 highbd_idct8(input, outptr, bd);
1886 input += 8; 1903 input += 8;
1887 outptr += 8; 1904 outptr += 8;
1888 } 1905 }
1889 // Then transform columns. 1906 // Then transform columns.
1890 for (i = 0; i < 8; ++i) { 1907 for (i = 0; i < 8; ++i) {
1891 for (j = 0; j < 8; ++j) 1908 for (j = 0; j < 8; ++j)
1892 temp_in[j] = out[j * 8 + i]; 1909 temp_in[j] = out[j * 8 + i];
1893 high_idct8(temp_in, temp_out, bd); 1910 highbd_idct8(temp_in, temp_out, bd);
1894 for (j = 0; j < 8; ++j) 1911 for (j = 0; j < 8; ++j) {
1895 dest[j * stride + i] = clip_pixel_bd_high( 1912 dest[j * stride + i] = highbd_clip_pixel_add(
1896 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1913 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1914 }
1897 } 1915 }
1898 } 1916 }
1899 1917
1900 static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { 1918 static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
1901 tran_low_t step1[16], step2[16]; 1919 tran_low_t step1[16], step2[16];
1902 tran_high_t temp1, temp2; 1920 tran_high_t temp1, temp2;
1903 (void) bd; 1921 (void) bd;
1904 1922
1905 // stage 1 1923 // stage 1
1906 step1[0] = input[0/2]; 1924 step1[0] = input[0/2];
1907 step1[1] = input[16/2]; 1925 step1[1] = input[16/2];
1908 step1[2] = input[8/2]; 1926 step1[2] = input[8/2];
1909 step1[3] = input[24/2]; 1927 step1[3] = input[24/2];
1910 step1[4] = input[4/2]; 1928 step1[4] = input[4/2];
(...skipping 14 matching lines...) Expand all
1925 step2[1] = step1[1]; 1943 step2[1] = step1[1];
1926 step2[2] = step1[2]; 1944 step2[2] = step1[2];
1927 step2[3] = step1[3]; 1945 step2[3] = step1[3];
1928 step2[4] = step1[4]; 1946 step2[4] = step1[4];
1929 step2[5] = step1[5]; 1947 step2[5] = step1[5];
1930 step2[6] = step1[6]; 1948 step2[6] = step1[6];
1931 step2[7] = step1[7]; 1949 step2[7] = step1[7];
1932 1950
1933 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 1951 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1934 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 1952 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1935 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 1953 step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
1936 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 1954 step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
1937 1955
1938 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1956 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1939 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1957 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1940 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 1958 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
1941 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 1959 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
1942 1960
1943 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1961 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1944 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1962 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1945 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 1963 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
1946 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 1964 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
1947 1965
1948 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1966 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1949 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1967 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1950 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 1968 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
1951 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 1969 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
1952 1970
1953 // stage 3 1971 // stage 3
1954 step1[0] = step2[0]; 1972 step1[0] = step2[0];
1955 step1[1] = step2[1]; 1973 step1[1] = step2[1];
1956 step1[2] = step2[2]; 1974 step1[2] = step2[2];
1957 step1[3] = step2[3]; 1975 step1[3] = step2[3];
1958 1976
1959 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1977 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1960 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1978 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1961 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 1979 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
1962 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 1980 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
1963 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1981 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1964 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1982 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1965 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1983 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1966 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1984 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1967 1985
1968 step1[8] = WRAPLOW(step2[8] + step2[9]); 1986 step1[8] = WRAPLOW(step2[8] + step2[9], bd);
1969 step1[9] = WRAPLOW(step2[8] - step2[9]); 1987 step1[9] = WRAPLOW(step2[8] - step2[9], bd);
1970 step1[10] = WRAPLOW(-step2[10] + step2[11]); 1988 step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
1971 step1[11] = WRAPLOW(step2[10] + step2[11]); 1989 step1[11] = WRAPLOW(step2[10] + step2[11], bd);
1972 step1[12] = WRAPLOW(step2[12] + step2[13]); 1990 step1[12] = WRAPLOW(step2[12] + step2[13], bd);
1973 step1[13] = WRAPLOW(step2[12] - step2[13]); 1991 step1[13] = WRAPLOW(step2[12] - step2[13], bd);
1974 step1[14] = WRAPLOW(-step2[14] + step2[15]); 1992 step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
1975 step1[15] = WRAPLOW(step2[14] + step2[15]); 1993 step1[15] = WRAPLOW(step2[14] + step2[15], bd);
1976 1994
1977 // stage 4 1995 // stage 4
1978 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1996 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1979 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1997 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1980 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 1998 step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
1981 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 1999 step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
1982 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2000 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1983 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2001 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1984 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 2002 step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
1985 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 2003 step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
1986 step2[4] = WRAPLOW(step1[4] + step1[5]); 2004 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1987 step2[5] = WRAPLOW(step1[4] - step1[5]); 2005 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1988 step2[6] = WRAPLOW(-step1[6] + step1[7]); 2006 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1989 step2[7] = WRAPLOW(step1[6] + step1[7]); 2007 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1990 2008
1991 step2[8] = step1[8]; 2009 step2[8] = step1[8];
1992 step2[15] = step1[15]; 2010 step2[15] = step1[15];
1993 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2011 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1994 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2012 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1995 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2013 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
1996 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2014 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
1997 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2015 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1998 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2016 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1999 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2017 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2000 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2018 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2001 step2[11] = step1[11]; 2019 step2[11] = step1[11];
2002 step2[12] = step1[12]; 2020 step2[12] = step1[12];
2003 2021
2004 // stage 5 2022 // stage 5
2005 step1[0] = WRAPLOW(step2[0] + step2[3]); 2023 step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2006 step1[1] = WRAPLOW(step2[1] + step2[2]); 2024 step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2007 step1[2] = WRAPLOW(step2[1] - step2[2]); 2025 step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2008 step1[3] = WRAPLOW(step2[0] - step2[3]); 2026 step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2009 step1[4] = step2[4]; 2027 step1[4] = step2[4];
2010 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2028 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2011 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2029 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2012 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2030 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2013 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2031 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2014 step1[7] = step2[7]; 2032 step1[7] = step2[7];
2015 2033
2016 step1[8] = WRAPLOW(step2[8] + step2[11]); 2034 step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2017 step1[9] = WRAPLOW(step2[9] + step2[10]); 2035 step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2018 step1[10] = WRAPLOW(step2[9] - step2[10]); 2036 step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2019 step1[11] = WRAPLOW(step2[8] - step2[11]); 2037 step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2020 step1[12] = WRAPLOW(-step2[12] + step2[15]); 2038 step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2021 step1[13] = WRAPLOW(-step2[13] + step2[14]); 2039 step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2022 step1[14] = WRAPLOW(step2[13] + step2[14]); 2040 step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2023 step1[15] = WRAPLOW(step2[12] + step2[15]); 2041 step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2024 2042
2025 // stage 6 2043 // stage 6
2026 step2[0] = WRAPLOW(step1[0] + step1[7]); 2044 step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2027 step2[1] = WRAPLOW(step1[1] + step1[6]); 2045 step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2028 step2[2] = WRAPLOW(step1[2] + step1[5]); 2046 step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2029 step2[3] = WRAPLOW(step1[3] + step1[4]); 2047 step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2030 step2[4] = WRAPLOW(step1[3] - step1[4]); 2048 step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2031 step2[5] = WRAPLOW(step1[2] - step1[5]); 2049 step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2032 step2[6] = WRAPLOW(step1[1] - step1[6]); 2050 step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2033 step2[7] = WRAPLOW(step1[0] - step1[7]); 2051 step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2034 step2[8] = step1[8]; 2052 step2[8] = step1[8];
2035 step2[9] = step1[9]; 2053 step2[9] = step1[9];
2036 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2054 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2037 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2055 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2038 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2056 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2039 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2057 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2040 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2058 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2041 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2059 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2042 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2060 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2043 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2061 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2044 step2[14] = step1[14]; 2062 step2[14] = step1[14];
2045 step2[15] = step1[15]; 2063 step2[15] = step1[15];
2046 2064
2047 // stage 7 2065 // stage 7
2048 output[0] = WRAPLOW(step2[0] + step2[15]); 2066 output[0] = WRAPLOW(step2[0] + step2[15], bd);
2049 output[1] = WRAPLOW(step2[1] + step2[14]); 2067 output[1] = WRAPLOW(step2[1] + step2[14], bd);
2050 output[2] = WRAPLOW(step2[2] + step2[13]); 2068 output[2] = WRAPLOW(step2[2] + step2[13], bd);
2051 output[3] = WRAPLOW(step2[3] + step2[12]); 2069 output[3] = WRAPLOW(step2[3] + step2[12], bd);
2052 output[4] = WRAPLOW(step2[4] + step2[11]); 2070 output[4] = WRAPLOW(step2[4] + step2[11], bd);
2053 output[5] = WRAPLOW(step2[5] + step2[10]); 2071 output[5] = WRAPLOW(step2[5] + step2[10], bd);
2054 output[6] = WRAPLOW(step2[6] + step2[9]); 2072 output[6] = WRAPLOW(step2[6] + step2[9], bd);
2055 output[7] = WRAPLOW(step2[7] + step2[8]); 2073 output[7] = WRAPLOW(step2[7] + step2[8], bd);
2056 output[8] = WRAPLOW(step2[7] - step2[8]); 2074 output[8] = WRAPLOW(step2[7] - step2[8], bd);
2057 output[9] = WRAPLOW(step2[6] - step2[9]); 2075 output[9] = WRAPLOW(step2[6] - step2[9], bd);
2058 output[10] = WRAPLOW(step2[5] - step2[10]); 2076 output[10] = WRAPLOW(step2[5] - step2[10], bd);
2059 output[11] = WRAPLOW(step2[4] - step2[11]); 2077 output[11] = WRAPLOW(step2[4] - step2[11], bd);
2060 output[12] = WRAPLOW(step2[3] - step2[12]); 2078 output[12] = WRAPLOW(step2[3] - step2[12], bd);
2061 output[13] = WRAPLOW(step2[2] - step2[13]); 2079 output[13] = WRAPLOW(step2[2] - step2[13], bd);
2062 output[14] = WRAPLOW(step2[1] - step2[14]); 2080 output[14] = WRAPLOW(step2[1] - step2[14], bd);
2063 output[15] = WRAPLOW(step2[0] - step2[15]); 2081 output[15] = WRAPLOW(step2[0] - step2[15], bd);
2064 } 2082 }
2065 2083
2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 2084 void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2067 int stride, int bd) { 2085 int stride, int bd) {
2068 tran_low_t out[16 * 16]; 2086 tran_low_t out[16 * 16];
2069 tran_low_t *outptr = out; 2087 tran_low_t *outptr = out;
2070 int i, j; 2088 int i, j;
2071 tran_low_t temp_in[16], temp_out[16]; 2089 tran_low_t temp_in[16], temp_out[16];
2072 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2090 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2073 2091
2074 // First transform rows. 2092 // First transform rows.
2075 for (i = 0; i < 16; ++i) { 2093 for (i = 0; i < 16; ++i) {
2076 high_idct16(input, outptr, bd); 2094 highbd_idct16(input, outptr, bd);
2077 input += 16; 2095 input += 16;
2078 outptr += 16; 2096 outptr += 16;
2079 } 2097 }
2080 2098
2081 // Then transform columns. 2099 // Then transform columns.
2082 for (i = 0; i < 16; ++i) { 2100 for (i = 0; i < 16; ++i) {
2083 for (j = 0; j < 16; ++j) 2101 for (j = 0; j < 16; ++j)
2084 temp_in[j] = out[j * 16 + i]; 2102 temp_in[j] = out[j * 16 + i];
2085 high_idct16(temp_in, temp_out, bd); 2103 highbd_idct16(temp_in, temp_out, bd);
2086 for (j = 0; j < 16; ++j) 2104 for (j = 0; j < 16; ++j) {
2087 dest[j * stride + i] = clip_pixel_bd_high( 2105 dest[j * stride + i] = highbd_clip_pixel_add(
2088 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2106 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2107 }
2089 } 2108 }
2090 } 2109 }
2091 2110
2092 static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { 2111 static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
2112 int bd) {
2093 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 2113 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
2094 tran_high_t s9, s10, s11, s12, s13, s14, s15; 2114 tran_high_t s9, s10, s11, s12, s13, s14, s15;
2095 2115
2096 tran_high_t x0 = input[15]; 2116 tran_high_t x0 = input[15];
2097 tran_high_t x1 = input[0]; 2117 tran_high_t x1 = input[0];
2098 tran_high_t x2 = input[13]; 2118 tran_high_t x2 = input[13];
2099 tran_high_t x3 = input[2]; 2119 tran_high_t x3 = input[2];
2100 tran_high_t x4 = input[11]; 2120 tran_high_t x4 = input[11];
2101 tran_high_t x5 = input[4]; 2121 tran_high_t x5 = input[4];
2102 tran_high_t x6 = input[9]; 2122 tran_high_t x6 = input[9];
(...skipping 25 matching lines...) Expand all
2128 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 2148 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
2129 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 2149 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
2130 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 2150 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
2131 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 2151 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
2132 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 2152 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
2133 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 2153 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
2134 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 2154 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
2135 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 2155 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
2136 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 2156 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
2137 2157
2138 x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); 2158 x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
2139 x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); 2159 x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
2140 x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); 2160 x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
2141 x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); 2161 x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
2142 x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); 2162 x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
2143 x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); 2163 x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
2144 x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); 2164 x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
2145 x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); 2165 x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
2146 x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); 2166 x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
2147 x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); 2167 x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
2148 x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); 2168 x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
2149 x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); 2169 x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
2150 x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); 2170 x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
2151 x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); 2171 x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
2152 x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); 2172 x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
2153 x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); 2173 x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
2154 2174
2155 // stage 2 2175 // stage 2
2156 s0 = x0; 2176 s0 = x0;
2157 s1 = x1; 2177 s1 = x1;
2158 s2 = x2; 2178 s2 = x2;
2159 s3 = x3; 2179 s3 = x3;
2160 s4 = x4; 2180 s4 = x4;
2161 s5 = x5; 2181 s5 = x5;
2162 s6 = x6; 2182 s6 = x6;
2163 s7 = x7; 2183 s7 = x7;
2164 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 2184 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
2165 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 2185 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
2166 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 2186 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
2167 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 2187 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
2168 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 2188 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
2169 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 2189 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
2170 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 2190 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
2171 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 2191 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
2172 2192
2173 x0 = WRAPLOW(s0 + s4); 2193 x0 = WRAPLOW(s0 + s4, bd);
2174 x1 = WRAPLOW(s1 + s5); 2194 x1 = WRAPLOW(s1 + s5, bd);
2175 x2 = WRAPLOW(s2 + s6); 2195 x2 = WRAPLOW(s2 + s6, bd);
2176 x3 = WRAPLOW(s3 + s7); 2196 x3 = WRAPLOW(s3 + s7, bd);
2177 x4 = WRAPLOW(s0 - s4); 2197 x4 = WRAPLOW(s0 - s4, bd);
2178 x5 = WRAPLOW(s1 - s5); 2198 x5 = WRAPLOW(s1 - s5, bd);
2179 x6 = WRAPLOW(s2 - s6); 2199 x6 = WRAPLOW(s2 - s6, bd);
2180 x7 = WRAPLOW(s3 - s7); 2200 x7 = WRAPLOW(s3 - s7, bd);
2181 x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); 2201 x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
2182 x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); 2202 x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
2183 x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); 2203 x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
2184 x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); 2204 x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
2185 x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); 2205 x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
2186 x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); 2206 x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
2187 x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); 2207 x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
2188 x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); 2208 x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
2189 2209
2190 // stage 3 2210 // stage 3
2191 s0 = x0; 2211 s0 = x0;
2192 s1 = x1; 2212 s1 = x1;
2193 s2 = x2; 2213 s2 = x2;
2194 s3 = x3; 2214 s3 = x3;
2195 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 2215 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
2196 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 2216 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
2197 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 2217 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
2198 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 2218 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
2199 s8 = x8; 2219 s8 = x8;
2200 s9 = x9; 2220 s9 = x9;
2201 s10 = x10; 2221 s10 = x10;
2202 s11 = x11; 2222 s11 = x11;
2203 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 2223 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
2204 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 2224 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
2205 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 2225 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
2206 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 2226 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
2207 2227
2208 x0 = WRAPLOW(s0 + s2); 2228 x0 = WRAPLOW(s0 + s2, bd);
2209 x1 = WRAPLOW(s1 + s3); 2229 x1 = WRAPLOW(s1 + s3, bd);
2210 x2 = WRAPLOW(s0 - s2); 2230 x2 = WRAPLOW(s0 - s2, bd);
2211 x3 = WRAPLOW(s1 - s3); 2231 x3 = WRAPLOW(s1 - s3, bd);
2212 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 2232 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
2213 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 2233 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
2214 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 2234 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
2215 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 2235 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
2216 x8 = WRAPLOW(s8 + s10); 2236 x8 = WRAPLOW(s8 + s10, bd);
2217 x9 = WRAPLOW(s9 + s11); 2237 x9 = WRAPLOW(s9 + s11, bd);
2218 x10 = WRAPLOW(s8 - s10); 2238 x10 = WRAPLOW(s8 - s10, bd);
2219 x11 = WRAPLOW(s9 - s11); 2239 x11 = WRAPLOW(s9 - s11, bd);
2220 x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); 2240 x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
2221 x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); 2241 x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
2222 x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); 2242 x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
2223 x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); 2243 x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
2224 2244
2225 // stage 4 2245 // stage 4
2226 s2 = (- cospi_16_64) * (x2 + x3); 2246 s2 = (- cospi_16_64) * (x2 + x3);
2227 s3 = cospi_16_64 * (x2 - x3); 2247 s3 = cospi_16_64 * (x2 - x3);
2228 s6 = cospi_16_64 * (x6 + x7); 2248 s6 = cospi_16_64 * (x6 + x7);
2229 s7 = cospi_16_64 * (-x6 + x7); 2249 s7 = cospi_16_64 * (-x6 + x7);
2230 s10 = cospi_16_64 * (x10 + x11); 2250 s10 = cospi_16_64 * (x10 + x11);
2231 s11 = cospi_16_64 * (-x10 + x11); 2251 s11 = cospi_16_64 * (-x10 + x11);
2232 s14 = (- cospi_16_64) * (x14 + x15); 2252 s14 = (- cospi_16_64) * (x14 + x15);
2233 s15 = cospi_16_64 * (x14 - x15); 2253 s15 = cospi_16_64 * (x14 - x15);
2234 2254
2235 x2 = WRAPLOW(dct_const_round_shift(s2)); 2255 x2 = WRAPLOW(dct_const_round_shift(s2), bd);
2236 x3 = WRAPLOW(dct_const_round_shift(s3)); 2256 x3 = WRAPLOW(dct_const_round_shift(s3), bd);
2237 x6 = WRAPLOW(dct_const_round_shift(s6)); 2257 x6 = WRAPLOW(dct_const_round_shift(s6), bd);
2238 x7 = WRAPLOW(dct_const_round_shift(s7)); 2258 x7 = WRAPLOW(dct_const_round_shift(s7), bd);
2239 x10 = WRAPLOW(dct_const_round_shift(s10)); 2259 x10 = WRAPLOW(dct_const_round_shift(s10), bd);
2240 x11 = WRAPLOW(dct_const_round_shift(s11)); 2260 x11 = WRAPLOW(dct_const_round_shift(s11), bd);
2241 x14 = WRAPLOW(dct_const_round_shift(s14)); 2261 x14 = WRAPLOW(dct_const_round_shift(s14), bd);
2242 x15 = WRAPLOW(dct_const_round_shift(s15)); 2262 x15 = WRAPLOW(dct_const_round_shift(s15), bd);
2243 2263
2244 output[0] = WRAPLOW(x0); 2264 output[0] = WRAPLOW(x0, bd);
2245 output[1] = WRAPLOW(-x8); 2265 output[1] = WRAPLOW(-x8, bd);
2246 output[2] = WRAPLOW(x12); 2266 output[2] = WRAPLOW(x12, bd);
2247 output[3] = WRAPLOW(-x4); 2267 output[3] = WRAPLOW(-x4, bd);
2248 output[4] = WRAPLOW(x6); 2268 output[4] = WRAPLOW(x6, bd);
2249 output[5] = WRAPLOW(x14); 2269 output[5] = WRAPLOW(x14, bd);
2250 output[6] = WRAPLOW(x10); 2270 output[6] = WRAPLOW(x10, bd);
2251 output[7] = WRAPLOW(x2); 2271 output[7] = WRAPLOW(x2, bd);
2252 output[8] = WRAPLOW(x3); 2272 output[8] = WRAPLOW(x3, bd);
2253 output[9] = WRAPLOW(x11); 2273 output[9] = WRAPLOW(x11, bd);
2254 output[10] = WRAPLOW(x15); 2274 output[10] = WRAPLOW(x15, bd);
2255 output[11] = WRAPLOW(x7); 2275 output[11] = WRAPLOW(x7, bd);
2256 output[12] = WRAPLOW(x5); 2276 output[12] = WRAPLOW(x5, bd);
2257 output[13] = WRAPLOW(-x13); 2277 output[13] = WRAPLOW(-x13, bd);
2258 output[14] = WRAPLOW(x9); 2278 output[14] = WRAPLOW(x9, bd);
2259 output[15] = WRAPLOW(-x1); 2279 output[15] = WRAPLOW(-x1, bd);
2260 } 2280 }
2261 2281
2262 static const high_transform_2d HIGH_IHT_16[] = { 2282 static const highbd_transform_2d HIGH_IHT_16[] = {
2263 { high_idct16, high_idct16 }, // DCT_DCT = 0 2283 { highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
2264 { high_iadst16, high_idct16 }, // ADST_DCT = 1 2284 { highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
2265 { high_idct16, high_iadst16 }, // DCT_ADST = 2 2285 { highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
2266 { high_iadst16, high_iadst16 } // ADST_ADST = 3 2286 { highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
2267 }; 2287 };
2268 2288
2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 2289 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2270 int stride, int tx_type, int bd) { 2290 int stride, int tx_type, int bd) {
2271 int i, j; 2291 int i, j;
2272 tran_low_t out[16 * 16]; 2292 tran_low_t out[16 * 16];
2273 tran_low_t *outptr = out; 2293 tran_low_t *outptr = out;
2274 tran_low_t temp_in[16], temp_out[16]; 2294 tran_low_t temp_in[16], temp_out[16];
2275 const high_transform_2d ht = HIGH_IHT_16[tx_type]; 2295 const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
2276 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2296 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2277 2297
2278 // Rows 2298 // Rows
2279 for (i = 0; i < 16; ++i) { 2299 for (i = 0; i < 16; ++i) {
2280 ht.rows(input, outptr, bd); 2300 ht.rows(input, outptr, bd);
2281 input += 16; 2301 input += 16;
2282 outptr += 16; 2302 outptr += 16;
2283 } 2303 }
2284 2304
2285 // Columns 2305 // Columns
2286 for (i = 0; i < 16; ++i) { 2306 for (i = 0; i < 16; ++i) {
2287 for (j = 0; j < 16; ++j) 2307 for (j = 0; j < 16; ++j)
2288 temp_in[j] = out[j * 16 + i]; 2308 temp_in[j] = out[j * 16 + i];
2289 ht.cols(temp_in, temp_out, bd); 2309 ht.cols(temp_in, temp_out, bd);
2290 for (j = 0; j < 16; ++j) 2310 for (j = 0; j < 16; ++j) {
2291 dest[j * stride + i] = clip_pixel_bd_high( 2311 dest[j * stride + i] = highbd_clip_pixel_add(
2292 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2312 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2313 }
2293 } 2314 }
2294 } 2315 }
2295 2316
2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, 2317 void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2297 int stride, int bd) { 2318 int stride, int bd) {
2298 tran_low_t out[16 * 16] = { 0 }; 2319 tran_low_t out[16 * 16] = { 0 };
2299 tran_low_t *outptr = out; 2320 tran_low_t *outptr = out;
2300 int i, j; 2321 int i, j;
2301 tran_low_t temp_in[16], temp_out[16]; 2322 tran_low_t temp_in[16], temp_out[16];
2302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2323 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2303 2324
2304 // First transform rows. Since all non-zero dct coefficients are in 2325 // First transform rows. Since all non-zero dct coefficients are in
2305 // upper-left 4x4 area, we only need to calculate first 4 rows here. 2326 // upper-left 4x4 area, we only need to calculate first 4 rows here.
2306 for (i = 0; i < 4; ++i) { 2327 for (i = 0; i < 4; ++i) {
2307 high_idct16(input, outptr, bd); 2328 highbd_idct16(input, outptr, bd);
2308 input += 16; 2329 input += 16;
2309 outptr += 16; 2330 outptr += 16;
2310 } 2331 }
2311 2332
2312 // Then transform columns. 2333 // Then transform columns.
2313 for (i = 0; i < 16; ++i) { 2334 for (i = 0; i < 16; ++i) {
2314 for (j = 0; j < 16; ++j) 2335 for (j = 0; j < 16; ++j)
2315 temp_in[j] = out[j*16 + i]; 2336 temp_in[j] = out[j*16 + i];
2316 high_idct16(temp_in, temp_out, bd); 2337 highbd_idct16(temp_in, temp_out, bd);
2317 for (j = 0; j < 16; ++j) 2338 for (j = 0; j < 16; ++j) {
2318 dest[j * stride + i] = clip_pixel_bd_high( 2339 dest[j * stride + i] = highbd_clip_pixel_add(
2319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2340 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2341 }
2320 } 2342 }
2321 } 2343 }
2322 2344
2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, 2345 void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2324 int stride, int bd) { 2346 int stride, int bd) {
2325 int i, j; 2347 int i, j;
2326 tran_high_t a1; 2348 tran_high_t a1;
2327 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 2349 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2328 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2350 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2329 2351
2330 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 2352 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2331 a1 = ROUND_POWER_OF_TWO(out, 6); 2353 a1 = ROUND_POWER_OF_TWO(out, 6);
2332 for (j = 0; j < 16; ++j) { 2354 for (j = 0; j < 16; ++j) {
2333 for (i = 0; i < 16; ++i) 2355 for (i = 0; i < 16; ++i)
2334 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 2356 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2335 dest += stride; 2357 dest += stride;
2336 } 2358 }
2337 } 2359 }
2338 2360
2339 static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { 2361 static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
2340 tran_low_t step1[32], step2[32]; 2362 tran_low_t step1[32], step2[32];
2341 tran_high_t temp1, temp2; 2363 tran_high_t temp1, temp2;
2342 (void) bd; 2364 (void) bd;
2343 2365
2344 // stage 1 2366 // stage 1
2345 step1[0] = input[0]; 2367 step1[0] = input[0];
2346 step1[1] = input[16]; 2368 step1[1] = input[16];
2347 step1[2] = input[8]; 2369 step1[2] = input[8];
2348 step1[3] = input[24]; 2370 step1[3] = input[24];
2349 step1[4] = input[4]; 2371 step1[4] = input[4];
2350 step1[5] = input[20]; 2372 step1[5] = input[20];
2351 step1[6] = input[12]; 2373 step1[6] = input[12];
2352 step1[7] = input[28]; 2374 step1[7] = input[28];
2353 step1[8] = input[2]; 2375 step1[8] = input[2];
2354 step1[9] = input[18]; 2376 step1[9] = input[18];
2355 step1[10] = input[10]; 2377 step1[10] = input[10];
2356 step1[11] = input[26]; 2378 step1[11] = input[26];
2357 step1[12] = input[6]; 2379 step1[12] = input[6];
2358 step1[13] = input[22]; 2380 step1[13] = input[22];
2359 step1[14] = input[14]; 2381 step1[14] = input[14];
2360 step1[15] = input[30]; 2382 step1[15] = input[30];
2361 2383
2362 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 2384 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2363 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 2385 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2364 step1[16] = WRAPLOW(dct_const_round_shift(temp1)); 2386 step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
2365 step1[31] = WRAPLOW(dct_const_round_shift(temp2)); 2387 step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
2366 2388
2367 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 2389 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2368 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 2390 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2369 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 2391 step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
2370 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 2392 step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
2371 2393
2372 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 2394 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2373 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 2395 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2374 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2396 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2375 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2397 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2376 2398
2377 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 2399 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2378 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 2400 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2379 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 2401 step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
2380 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 2402 step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
2381 2403
2382 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 2404 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2383 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 2405 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2384 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2406 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2385 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2407 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2386 2408
2387 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 2409 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2388 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 2410 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2389 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2411 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2390 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2412 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2391 2413
2392 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 2414 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2393 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 2415 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2394 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2416 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2395 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2417 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2396 2418
2397 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 2419 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2398 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 2420 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2399 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 2421 step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
2400 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 2422 step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
2401 2423
2402 // stage 2 2424 // stage 2
2403 step2[0] = step1[0]; 2425 step2[0] = step1[0];
2404 step2[1] = step1[1]; 2426 step2[1] = step1[1];
2405 step2[2] = step1[2]; 2427 step2[2] = step1[2];
2406 step2[3] = step1[3]; 2428 step2[3] = step1[3];
2407 step2[4] = step1[4]; 2429 step2[4] = step1[4];
2408 step2[5] = step1[5]; 2430 step2[5] = step1[5];
2409 step2[6] = step1[6]; 2431 step2[6] = step1[6];
2410 step2[7] = step1[7]; 2432 step2[7] = step1[7];
2411 2433
2412 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 2434 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2413 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 2435 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2414 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 2436 step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
2415 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 2437 step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
2416 2438
2417 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 2439 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2418 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 2440 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2419 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2441 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2420 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2442 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2421 2443
2422 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 2444 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2423 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 2445 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2424 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2446 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2425 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2447 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2426 2448
2427 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 2449 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2428 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 2450 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2429 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2451 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2430 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2452 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2431 2453
2432 step2[16] = WRAPLOW(step1[16] + step1[17]); 2454 step2[16] = WRAPLOW(step1[16] + step1[17], bd);
2433 step2[17] = WRAPLOW(step1[16] - step1[17]); 2455 step2[17] = WRAPLOW(step1[16] - step1[17], bd);
2434 step2[18] = WRAPLOW(-step1[18] + step1[19]); 2456 step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
2435 step2[19] = WRAPLOW(step1[18] + step1[19]); 2457 step2[19] = WRAPLOW(step1[18] + step1[19], bd);
2436 step2[20] = WRAPLOW(step1[20] + step1[21]); 2458 step2[20] = WRAPLOW(step1[20] + step1[21], bd);
2437 step2[21] = WRAPLOW(step1[20] - step1[21]); 2459 step2[21] = WRAPLOW(step1[20] - step1[21], bd);
2438 step2[22] = WRAPLOW(-step1[22] + step1[23]); 2460 step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
2439 step2[23] = WRAPLOW(step1[22] + step1[23]); 2461 step2[23] = WRAPLOW(step1[22] + step1[23], bd);
2440 step2[24] = WRAPLOW(step1[24] + step1[25]); 2462 step2[24] = WRAPLOW(step1[24] + step1[25], bd);
2441 step2[25] = WRAPLOW(step1[24] - step1[25]); 2463 step2[25] = WRAPLOW(step1[24] - step1[25], bd);
2442 step2[26] = WRAPLOW(-step1[26] + step1[27]); 2464 step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
2443 step2[27] = WRAPLOW(step1[26] + step1[27]); 2465 step2[27] = WRAPLOW(step1[26] + step1[27], bd);
2444 step2[28] = WRAPLOW(step1[28] + step1[29]); 2466 step2[28] = WRAPLOW(step1[28] + step1[29], bd);
2445 step2[29] = WRAPLOW(step1[28] - step1[29]); 2467 step2[29] = WRAPLOW(step1[28] - step1[29], bd);
2446 step2[30] = WRAPLOW(-step1[30] + step1[31]); 2468 step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
2447 step2[31] = WRAPLOW(step1[30] + step1[31]); 2469 step2[31] = WRAPLOW(step1[30] + step1[31], bd);
2448 2470
2449 // stage 3 2471 // stage 3
2450 step1[0] = step2[0]; 2472 step1[0] = step2[0];
2451 step1[1] = step2[1]; 2473 step1[1] = step2[1];
2452 step1[2] = step2[2]; 2474 step1[2] = step2[2];
2453 step1[3] = step2[3]; 2475 step1[3] = step2[3];
2454 2476
2455 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 2477 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2456 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 2478 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2457 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 2479 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
2458 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 2480 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
2459 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 2481 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2460 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 2482 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2461 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2483 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2462 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2484 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2463 2485
2464 step1[8] = WRAPLOW(step2[8] + step2[9]); 2486 step1[8] = WRAPLOW(step2[8] + step2[9], bd);
2465 step1[9] = WRAPLOW(step2[8] - step2[9]); 2487 step1[9] = WRAPLOW(step2[8] - step2[9], bd);
2466 step1[10] = WRAPLOW(-step2[10] + step2[11]); 2488 step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
2467 step1[11] = WRAPLOW(step2[10] + step2[11]); 2489 step1[11] = WRAPLOW(step2[10] + step2[11], bd);
2468 step1[12] = WRAPLOW(step2[12] + step2[13]); 2490 step1[12] = WRAPLOW(step2[12] + step2[13], bd);
2469 step1[13] = WRAPLOW(step2[12] - step2[13]); 2491 step1[13] = WRAPLOW(step2[12] - step2[13], bd);
2470 step1[14] = WRAPLOW(-step2[14] + step2[15]); 2492 step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
2471 step1[15] = WRAPLOW(step2[14] + step2[15]); 2493 step1[15] = WRAPLOW(step2[14] + step2[15], bd);
2472 2494
2473 step1[16] = step2[16]; 2495 step1[16] = step2[16];
2474 step1[31] = step2[31]; 2496 step1[31] = step2[31];
2475 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 2497 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2476 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 2498 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2477 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 2499 step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
2478 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 2500 step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
2479 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 2501 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2480 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 2502 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2481 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2503 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2482 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2504 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2483 step1[19] = step2[19]; 2505 step1[19] = step2[19];
2484 step1[20] = step2[20]; 2506 step1[20] = step2[20];
2485 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 2507 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2486 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 2508 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2487 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2509 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2488 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2510 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2489 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 2511 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2490 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 2512 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2491 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2513 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2492 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2514 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2493 step1[23] = step2[23]; 2515 step1[23] = step2[23];
2494 step1[24] = step2[24]; 2516 step1[24] = step2[24];
2495 step1[27] = step2[27]; 2517 step1[27] = step2[27];
2496 step1[28] = step2[28]; 2518 step1[28] = step2[28];
2497 2519
2498 // stage 4 2520 // stage 4
2499 temp1 = (step1[0] + step1[1]) * cospi_16_64; 2521 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2500 temp2 = (step1[0] - step1[1]) * cospi_16_64; 2522 temp2 = (step1[0] - step1[1]) * cospi_16_64;
2501 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 2523 step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
2502 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 2524 step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
2503 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2525 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2504 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2526 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2505 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 2527 step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
2506 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 2528 step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
2507 step2[4] = WRAPLOW(step1[4] + step1[5]); 2529 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
2508 step2[5] = WRAPLOW(step1[4] - step1[5]); 2530 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
2509 step2[6] = WRAPLOW(-step1[6] + step1[7]); 2531 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
2510 step2[7] = WRAPLOW(step1[6] + step1[7]); 2532 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
2511 2533
2512 step2[8] = step1[8]; 2534 step2[8] = step1[8];
2513 step2[15] = step1[15]; 2535 step2[15] = step1[15];
2514 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2536 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2515 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2537 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2516 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2538 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2517 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2539 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2518 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2540 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2519 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2541 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2520 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2542 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2521 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2543 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2522 step2[11] = step1[11]; 2544 step2[11] = step1[11];
2523 step2[12] = step1[12]; 2545 step2[12] = step1[12];
2524 2546
2525 step2[16] = WRAPLOW(step1[16] + step1[19]); 2547 step2[16] = WRAPLOW(step1[16] + step1[19], bd);
2526 step2[17] = WRAPLOW(step1[17] + step1[18]); 2548 step2[17] = WRAPLOW(step1[17] + step1[18], bd);
2527 step2[18] = WRAPLOW(step1[17] - step1[18]); 2549 step2[18] = WRAPLOW(step1[17] - step1[18], bd);
2528 step2[19] = WRAPLOW(step1[16] - step1[19]); 2550 step2[19] = WRAPLOW(step1[16] - step1[19], bd);
2529 step2[20] = WRAPLOW(-step1[20] + step1[23]); 2551 step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
2530 step2[21] = WRAPLOW(-step1[21] + step1[22]); 2552 step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
2531 step2[22] = WRAPLOW(step1[21] + step1[22]); 2553 step2[22] = WRAPLOW(step1[21] + step1[22], bd);
2532 step2[23] = WRAPLOW(step1[20] + step1[23]); 2554 step2[23] = WRAPLOW(step1[20] + step1[23], bd);
2533 2555
2534 step2[24] = WRAPLOW(step1[24] + step1[27]); 2556 step2[24] = WRAPLOW(step1[24] + step1[27], bd);
2535 step2[25] = WRAPLOW(step1[25] + step1[26]); 2557 step2[25] = WRAPLOW(step1[25] + step1[26], bd);
2536 step2[26] = WRAPLOW(step1[25] - step1[26]); 2558 step2[26] = WRAPLOW(step1[25] - step1[26], bd);
2537 step2[27] = WRAPLOW(step1[24] - step1[27]); 2559 step2[27] = WRAPLOW(step1[24] - step1[27], bd);
2538 step2[28] = WRAPLOW(-step1[28] + step1[31]); 2560 step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
2539 step2[29] = WRAPLOW(-step1[29] + step1[30]); 2561 step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
2540 step2[30] = WRAPLOW(step1[29] + step1[30]); 2562 step2[30] = WRAPLOW(step1[29] + step1[30], bd);
2541 step2[31] = WRAPLOW(step1[28] + step1[31]); 2563 step2[31] = WRAPLOW(step1[28] + step1[31], bd);
2542 2564
2543 // stage 5 2565 // stage 5
2544 step1[0] = WRAPLOW(step2[0] + step2[3]); 2566 step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2545 step1[1] = WRAPLOW(step2[1] + step2[2]); 2567 step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2546 step1[2] = WRAPLOW(step2[1] - step2[2]); 2568 step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2547 step1[3] = WRAPLOW(step2[0] - step2[3]); 2569 step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2548 step1[4] = step2[4]; 2570 step1[4] = step2[4];
2549 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2571 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2550 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2572 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2551 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2573 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2552 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2574 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2553 step1[7] = step2[7]; 2575 step1[7] = step2[7];
2554 2576
2555 step1[8] = WRAPLOW(step2[8] + step2[11]); 2577 step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2556 step1[9] = WRAPLOW(step2[9] + step2[10]); 2578 step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2557 step1[10] = WRAPLOW(step2[9] - step2[10]); 2579 step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2558 step1[11] = WRAPLOW(step2[8] - step2[11]); 2580 step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2559 step1[12] = WRAPLOW(-step2[12] + step2[15]); 2581 step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2560 step1[13] = WRAPLOW(-step2[13] + step2[14]); 2582 step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2561 step1[14] = WRAPLOW(step2[13] + step2[14]); 2583 step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2562 step1[15] = WRAPLOW(step2[12] + step2[15]); 2584 step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2563 2585
2564 step1[16] = step2[16]; 2586 step1[16] = step2[16];
2565 step1[17] = step2[17]; 2587 step1[17] = step2[17];
2566 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 2588 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2567 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 2589 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2568 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2590 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2569 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2591 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2570 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 2592 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2571 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 2593 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2572 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 2594 step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
2573 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 2595 step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
2574 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 2596 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2575 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 2597 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2576 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2598 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2577 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2599 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2578 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 2600 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2579 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 2601 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2580 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2602 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2581 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2603 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2582 step1[22] = step2[22]; 2604 step1[22] = step2[22];
2583 step1[23] = step2[23]; 2605 step1[23] = step2[23];
2584 step1[24] = step2[24]; 2606 step1[24] = step2[24];
2585 step1[25] = step2[25]; 2607 step1[25] = step2[25];
2586 step1[30] = step2[30]; 2608 step1[30] = step2[30];
2587 step1[31] = step2[31]; 2609 step1[31] = step2[31];
2588 2610
2589 // stage 6 2611 // stage 6
2590 step2[0] = WRAPLOW(step1[0] + step1[7]); 2612 step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2591 step2[1] = WRAPLOW(step1[1] + step1[6]); 2613 step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2592 step2[2] = WRAPLOW(step1[2] + step1[5]); 2614 step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2593 step2[3] = WRAPLOW(step1[3] + step1[4]); 2615 step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2594 step2[4] = WRAPLOW(step1[3] - step1[4]); 2616 step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2595 step2[5] = WRAPLOW(step1[2] - step1[5]); 2617 step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2596 step2[6] = WRAPLOW(step1[1] - step1[6]); 2618 step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2597 step2[7] = WRAPLOW(step1[0] - step1[7]); 2619 step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2598 step2[8] = step1[8]; 2620 step2[8] = step1[8];
2599 step2[9] = step1[9]; 2621 step2[9] = step1[9];
2600 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2622 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2601 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2623 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2602 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2624 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2603 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2625 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2604 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2626 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2605 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2627 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2606 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2628 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2607 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2629 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2608 step2[14] = WRAPLOW(step1[14]); 2630 step2[14] = step1[14];
2609 step2[15] = WRAPLOW(step1[15]); 2631 step2[15] = step1[15];
2610 2632
2611 step2[16] = WRAPLOW(step1[16] + step1[23]); 2633 step2[16] = WRAPLOW(step1[16] + step1[23], bd);
2612 step2[17] = WRAPLOW(step1[17] + step1[22]); 2634 step2[17] = WRAPLOW(step1[17] + step1[22], bd);
2613 step2[18] = WRAPLOW(step1[18] + step1[21]); 2635 step2[18] = WRAPLOW(step1[18] + step1[21], bd);
2614 step2[19] = WRAPLOW(step1[19] + step1[20]); 2636 step2[19] = WRAPLOW(step1[19] + step1[20], bd);
2615 step2[20] = WRAPLOW(step1[19] - step1[20]); 2637 step2[20] = WRAPLOW(step1[19] - step1[20], bd);
2616 step2[21] = WRAPLOW(step1[18] - step1[21]); 2638 step2[21] = WRAPLOW(step1[18] - step1[21], bd);
2617 step2[22] = WRAPLOW(step1[17] - step1[22]); 2639 step2[22] = WRAPLOW(step1[17] - step1[22], bd);
2618 step2[23] = WRAPLOW(step1[16] - step1[23]); 2640 step2[23] = WRAPLOW(step1[16] - step1[23], bd);
2619 2641
2620 step2[24] = WRAPLOW(-step1[24] + step1[31]); 2642 step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
2621 step2[25] = WRAPLOW(-step1[25] + step1[30]); 2643 step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
2622 step2[26] = WRAPLOW(-step1[26] + step1[29]); 2644 step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
2623 step2[27] = WRAPLOW(-step1[27] + step1[28]); 2645 step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
2624 step2[28] = WRAPLOW(step1[27] + step1[28]); 2646 step2[28] = WRAPLOW(step1[27] + step1[28], bd);
2625 step2[29] = WRAPLOW(step1[26] + step1[29]); 2647 step2[29] = WRAPLOW(step1[26] + step1[29], bd);
2626 step2[30] = WRAPLOW(step1[25] + step1[30]); 2648 step2[30] = WRAPLOW(step1[25] + step1[30], bd);
2627 step2[31] = WRAPLOW(step1[24] + step1[31]); 2649 step2[31] = WRAPLOW(step1[24] + step1[31], bd);
2628 2650
2629 // stage 7 2651 // stage 7
2630 step1[0] = WRAPLOW(step2[0] + step2[15]); 2652 step1[0] = WRAPLOW(step2[0] + step2[15], bd);
2631 step1[1] = WRAPLOW(step2[1] + step2[14]); 2653 step1[1] = WRAPLOW(step2[1] + step2[14], bd);
2632 step1[2] = WRAPLOW(step2[2] + step2[13]); 2654 step1[2] = WRAPLOW(step2[2] + step2[13], bd);
2633 step1[3] = WRAPLOW(step2[3] + step2[12]); 2655 step1[3] = WRAPLOW(step2[3] + step2[12], bd);
2634 step1[4] = WRAPLOW(step2[4] + step2[11]); 2656 step1[4] = WRAPLOW(step2[4] + step2[11], bd);
2635 step1[5] = WRAPLOW(step2[5] + step2[10]); 2657 step1[5] = WRAPLOW(step2[5] + step2[10], bd);
2636 step1[6] = WRAPLOW(step2[6] + step2[9]); 2658 step1[6] = WRAPLOW(step2[6] + step2[9], bd);
2637 step1[7] = WRAPLOW(step2[7] + step2[8]); 2659 step1[7] = WRAPLOW(step2[7] + step2[8], bd);
2638 step1[8] = WRAPLOW(step2[7] - step2[8]); 2660 step1[8] = WRAPLOW(step2[7] - step2[8], bd);
2639 step1[9] = WRAPLOW(step2[6] - step2[9]); 2661 step1[9] = WRAPLOW(step2[6] - step2[9], bd);
2640 step1[10] = WRAPLOW(step2[5] - step2[10]); 2662 step1[10] = WRAPLOW(step2[5] - step2[10], bd);
2641 step1[11] = WRAPLOW(step2[4] - step2[11]); 2663 step1[11] = WRAPLOW(step2[4] - step2[11], bd);
2642 step1[12] = WRAPLOW(step2[3] - step2[12]); 2664 step1[12] = WRAPLOW(step2[3] - step2[12], bd);
2643 step1[13] = WRAPLOW(step2[2] - step2[13]); 2665 step1[13] = WRAPLOW(step2[2] - step2[13], bd);
2644 step1[14] = WRAPLOW(step2[1] - step2[14]); 2666 step1[14] = WRAPLOW(step2[1] - step2[14], bd);
2645 step1[15] = WRAPLOW(step2[0] - step2[15]); 2667 step1[15] = WRAPLOW(step2[0] - step2[15], bd);
2646 2668
2647 step1[16] = step2[16]; 2669 step1[16] = step2[16];
2648 step1[17] = step2[17]; 2670 step1[17] = step2[17];
2649 step1[18] = step2[18]; 2671 step1[18] = step2[18];
2650 step1[19] = step2[19]; 2672 step1[19] = step2[19];
2651 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 2673 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2652 temp2 = (step2[20] + step2[27]) * cospi_16_64; 2674 temp2 = (step2[20] + step2[27]) * cospi_16_64;
2653 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2675 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2654 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2676 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2655 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 2677 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2656 temp2 = (step2[21] + step2[26]) * cospi_16_64; 2678 temp2 = (step2[21] + step2[26]) * cospi_16_64;
2657 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2679 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2658 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2680 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2659 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 2681 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2660 temp2 = (step2[22] + step2[25]) * cospi_16_64; 2682 temp2 = (step2[22] + step2[25]) * cospi_16_64;
2661 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2683 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2662 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2684 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2663 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 2685 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2664 temp2 = (step2[23] + step2[24]) * cospi_16_64; 2686 temp2 = (step2[23] + step2[24]) * cospi_16_64;
2665 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 2687 step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
2666 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 2688 step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
2667 step1[28] = step2[28]; 2689 step1[28] = step2[28];
2668 step1[29] = step2[29]; 2690 step1[29] = step2[29];
2669 step1[30] = step2[30]; 2691 step1[30] = step2[30];
2670 step1[31] = step2[31]; 2692 step1[31] = step2[31];
2671 2693
2672 // final stage 2694 // final stage
2673 output[0] = WRAPLOW(step1[0] + step1[31]); 2695 output[0] = WRAPLOW(step1[0] + step1[31], bd);
2674 output[1] = WRAPLOW(step1[1] + step1[30]); 2696 output[1] = WRAPLOW(step1[1] + step1[30], bd);
2675 output[2] = WRAPLOW(step1[2] + step1[29]); 2697 output[2] = WRAPLOW(step1[2] + step1[29], bd);
2676 output[3] = WRAPLOW(step1[3] + step1[28]); 2698 output[3] = WRAPLOW(step1[3] + step1[28], bd);
2677 output[4] = WRAPLOW(step1[4] + step1[27]); 2699 output[4] = WRAPLOW(step1[4] + step1[27], bd);
2678 output[5] = WRAPLOW(step1[5] + step1[26]); 2700 output[5] = WRAPLOW(step1[5] + step1[26], bd);
2679 output[6] = WRAPLOW(step1[6] + step1[25]); 2701 output[6] = WRAPLOW(step1[6] + step1[25], bd);
2680 output[7] = WRAPLOW(step1[7] + step1[24]); 2702 output[7] = WRAPLOW(step1[7] + step1[24], bd);
2681 output[8] = WRAPLOW(step1[8] + step1[23]); 2703 output[8] = WRAPLOW(step1[8] + step1[23], bd);
2682 output[9] = WRAPLOW(step1[9] + step1[22]); 2704 output[9] = WRAPLOW(step1[9] + step1[22], bd);
2683 output[10] = WRAPLOW(step1[10] + step1[21]); 2705 output[10] = WRAPLOW(step1[10] + step1[21], bd);
2684 output[11] = WRAPLOW(step1[11] + step1[20]); 2706 output[11] = WRAPLOW(step1[11] + step1[20], bd);
2685 output[12] = WRAPLOW(step1[12] + step1[19]); 2707 output[12] = WRAPLOW(step1[12] + step1[19], bd);
2686 output[13] = WRAPLOW(step1[13] + step1[18]); 2708 output[13] = WRAPLOW(step1[13] + step1[18], bd);
2687 output[14] = WRAPLOW(step1[14] + step1[17]); 2709 output[14] = WRAPLOW(step1[14] + step1[17], bd);
2688 output[15] = WRAPLOW(step1[15] + step1[16]); 2710 output[15] = WRAPLOW(step1[15] + step1[16], bd);
2689 output[16] = WRAPLOW(step1[15] - step1[16]); 2711 output[16] = WRAPLOW(step1[15] - step1[16], bd);
2690 output[17] = WRAPLOW(step1[14] - step1[17]); 2712 output[17] = WRAPLOW(step1[14] - step1[17], bd);
2691 output[18] = WRAPLOW(step1[13] - step1[18]); 2713 output[18] = WRAPLOW(step1[13] - step1[18], bd);
2692 output[19] = WRAPLOW(step1[12] - step1[19]); 2714 output[19] = WRAPLOW(step1[12] - step1[19], bd);
2693 output[20] = WRAPLOW(step1[11] - step1[20]); 2715 output[20] = WRAPLOW(step1[11] - step1[20], bd);
2694 output[21] = WRAPLOW(step1[10] - step1[21]); 2716 output[21] = WRAPLOW(step1[10] - step1[21], bd);
2695 output[22] = WRAPLOW(step1[9] - step1[22]); 2717 output[22] = WRAPLOW(step1[9] - step1[22], bd);
2696 output[23] = WRAPLOW(step1[8] - step1[23]); 2718 output[23] = WRAPLOW(step1[8] - step1[23], bd);
2697 output[24] = WRAPLOW(step1[7] - step1[24]); 2719 output[24] = WRAPLOW(step1[7] - step1[24], bd);
2698 output[25] = WRAPLOW(step1[6] - step1[25]); 2720 output[25] = WRAPLOW(step1[6] - step1[25], bd);
2699 output[26] = WRAPLOW(step1[5] - step1[26]); 2721 output[26] = WRAPLOW(step1[5] - step1[26], bd);
2700 output[27] = WRAPLOW(step1[4] - step1[27]); 2722 output[27] = WRAPLOW(step1[4] - step1[27], bd);
2701 output[28] = WRAPLOW(step1[3] - step1[28]); 2723 output[28] = WRAPLOW(step1[3] - step1[28], bd);
2702 output[29] = WRAPLOW(step1[2] - step1[29]); 2724 output[29] = WRAPLOW(step1[2] - step1[29], bd);
2703 output[30] = WRAPLOW(step1[1] - step1[30]); 2725 output[30] = WRAPLOW(step1[1] - step1[30], bd);
2704 output[31] = WRAPLOW(step1[0] - step1[31]); 2726 output[31] = WRAPLOW(step1[0] - step1[31], bd);
2705 } 2727 }
2706 2728
2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, 2729 void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2708 int stride, int bd) { 2730 int stride, int bd) {
2709 tran_low_t out[32 * 32]; 2731 tran_low_t out[32 * 32];
2710 tran_low_t *outptr = out; 2732 tran_low_t *outptr = out;
2711 int i, j; 2733 int i, j;
2712 tran_low_t temp_in[32], temp_out[32]; 2734 tran_low_t temp_in[32], temp_out[32];
2713 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2735 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2714 2736
2715 // Rows 2737 // Rows
2716 for (i = 0; i < 32; ++i) { 2738 for (i = 0; i < 32; ++i) {
2717 tran_low_t zero_coeff[16]; 2739 tran_low_t zero_coeff[16];
2718 for (j = 0; j < 16; ++j) 2740 for (j = 0; j < 16; ++j)
2719 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 2741 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2720 for (j = 0; j < 8; ++j) 2742 for (j = 0; j < 8; ++j)
2721 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2743 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2722 for (j = 0; j < 4; ++j) 2744 for (j = 0; j < 4; ++j)
2723 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2745 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2724 for (j = 0; j < 2; ++j) 2746 for (j = 0; j < 2; ++j)
2725 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2747 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2726 2748
2727 if (zero_coeff[0] | zero_coeff[1]) 2749 if (zero_coeff[0] | zero_coeff[1])
2728 high_idct32(input, outptr, bd); 2750 highbd_idct32(input, outptr, bd);
2729 else 2751 else
2730 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); 2752 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
2731 input += 32; 2753 input += 32;
2732 outptr += 32; 2754 outptr += 32;
2733 } 2755 }
2734 2756
2735 // Columns 2757 // Columns
2736 for (i = 0; i < 32; ++i) { 2758 for (i = 0; i < 32; ++i) {
2737 for (j = 0; j < 32; ++j) 2759 for (j = 0; j < 32; ++j)
2738 temp_in[j] = out[j * 32 + i]; 2760 temp_in[j] = out[j * 32 + i];
2739 high_idct32(temp_in, temp_out, bd); 2761 highbd_idct32(temp_in, temp_out, bd);
2740 for (j = 0; j < 32; ++j) 2762 for (j = 0; j < 32; ++j) {
2741 dest[j * stride + i] = clip_pixel_bd_high( 2763 dest[j * stride + i] = highbd_clip_pixel_add(
2742 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2764 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2765 }
2743 } 2766 }
2744 } 2767 }
2745 2768
2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, 2769 void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2747 int stride, int bd) { 2770 int stride, int bd) {
2748 tran_low_t out[32 * 32] = {0}; 2771 tran_low_t out[32 * 32] = {0};
2749 tran_low_t *outptr = out; 2772 tran_low_t *outptr = out;
2750 int i, j; 2773 int i, j;
2751 tran_low_t temp_in[32], temp_out[32]; 2774 tran_low_t temp_in[32], temp_out[32];
2752 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2775 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2753 2776
2754 // Rows 2777 // Rows
2755 // Only upper-left 8x8 has non-zero coeff. 2778 // Only upper-left 8x8 has non-zero coeff.
2756 for (i = 0; i < 8; ++i) { 2779 for (i = 0; i < 8; ++i) {
2757 high_idct32(input, outptr, bd); 2780 highbd_idct32(input, outptr, bd);
2758 input += 32; 2781 input += 32;
2759 outptr += 32; 2782 outptr += 32;
2760 } 2783 }
2761 // Columns 2784 // Columns
2762 for (i = 0; i < 32; ++i) { 2785 for (i = 0; i < 32; ++i) {
2763 for (j = 0; j < 32; ++j) 2786 for (j = 0; j < 32; ++j)
2764 temp_in[j] = out[j * 32 + i]; 2787 temp_in[j] = out[j * 32 + i];
2765 high_idct32(temp_in, temp_out, bd); 2788 highbd_idct32(temp_in, temp_out, bd);
2766 for (j = 0; j < 32; ++j) 2789 for (j = 0; j < 32; ++j) {
2767 dest[j * stride + i] = clip_pixel_bd_high( 2790 dest[j * stride + i] = highbd_clip_pixel_add(
2768 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2791 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2792 }
2769 } 2793 }
2770 } 2794 }
2771 2795
2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, 2796 void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2773 int stride, int bd) { 2797 int stride, int bd) {
2774 int i, j; 2798 int i, j;
2775 int a1; 2799 int a1;
2776 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2800 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2777 2801
2778 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 2802 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2779 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 2803 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2780 a1 = ROUND_POWER_OF_TWO(out, 6); 2804 a1 = ROUND_POWER_OF_TWO(out, 6);
2781 2805
2782 for (j = 0; j < 32; ++j) { 2806 for (j = 0; j < 32; ++j) {
2783 for (i = 0; i < 32; ++i) 2807 for (i = 0; i < 32; ++i)
2784 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 2808 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2785 dest += stride; 2809 dest += stride;
2786 } 2810 }
2787 } 2811 }
2788 2812
2789 // idct 2813 // idct
2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 2814 void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2791 int eob, int bd) { 2815 int eob, int bd) {
2792 if (eob > 1) 2816 if (eob > 1)
2793 vp9_high_idct4x4_16_add(input, dest, stride, bd); 2817 vp9_highbd_idct4x4_16_add(input, dest, stride, bd);
2794 else 2818 else
2795 vp9_high_idct4x4_1_add(input, dest, stride, bd); 2819 vp9_highbd_idct4x4_1_add(input, dest, stride, bd);
2796 } 2820 }
2797 2821
2798 2822
2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 2823 void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2800 int eob, int bd) { 2824 int eob, int bd) {
2801 if (eob > 1) 2825 if (eob > 1)
2802 vp9_high_iwht4x4_16_add(input, dest, stride, bd); 2826 vp9_highbd_iwht4x4_16_add(input, dest, stride, bd);
2803 else 2827 else
2804 vp9_high_iwht4x4_1_add(input, dest, stride, bd); 2828 vp9_highbd_iwht4x4_1_add(input, dest, stride, bd);
2805 } 2829 }
2806 2830
2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, 2831 void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
2808 int eob, int bd) { 2832 int eob, int bd) {
2809 // If dc is 1, then input[0] is the reconstructed value, do not need 2833 // If dc is 1, then input[0] is the reconstructed value, do not need
2810 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. 2834 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
2811 2835
2812 // The calculation can be simplified if there are not many non-zero dct 2836 // The calculation can be simplified if there are not many non-zero dct
2813 // coefficients. Use eobs to decide what to do. 2837 // coefficients. Use eobs to decide what to do.
2814 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. 2838 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
2815 // Combine that with code here. 2839 // Combine that with code here.
2816 // DC only DCT coefficient 2840 // DC only DCT coefficient
2817 if (eob == 1) { 2841 if (eob == 1) {
2818 vp9_high_idct8x8_1_add(input, dest, stride, bd); 2842 vp9_highbd_idct8x8_1_add(input, dest, stride, bd);
2819 } else if (eob <= 10) { 2843 } else if (eob <= 10) {
2820 vp9_high_idct8x8_10_add(input, dest, stride, bd); 2844 vp9_highbd_idct8x8_10_add(input, dest, stride, bd);
2821 } else { 2845 } else {
2822 vp9_high_idct8x8_64_add(input, dest, stride, bd); 2846 vp9_highbd_idct8x8_64_add(input, dest, stride, bd);
2823 } 2847 }
2824 } 2848 }
2825 2849
2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, 2850 void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
2827 int eob, int bd) { 2851 int stride, int eob, int bd) {
2828 // The calculation can be simplified if there are not many non-zero dct 2852 // The calculation can be simplified if there are not many non-zero dct
2829 // coefficients. Use eobs to separate different cases. 2853 // coefficients. Use eobs to separate different cases.
2830 // DC only DCT coefficient. 2854 // DC only DCT coefficient.
2831 if (eob == 1) { 2855 if (eob == 1) {
2832 vp9_high_idct16x16_1_add(input, dest, stride, bd); 2856 vp9_highbd_idct16x16_1_add(input, dest, stride, bd);
2833 } else if (eob <= 10) { 2857 } else if (eob <= 10) {
2834 vp9_high_idct16x16_10_add(input, dest, stride, bd); 2858 vp9_highbd_idct16x16_10_add(input, dest, stride, bd);
2835 } else { 2859 } else {
2836 vp9_high_idct16x16_256_add(input, dest, stride, bd); 2860 vp9_highbd_idct16x16_256_add(input, dest, stride, bd);
2837 } 2861 }
2838 } 2862 }
2839 2863
2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, 2864 void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
2841 int eob, int bd) { 2865 int stride, int eob, int bd) {
2842 // Non-zero coeff only in upper-left 8x8 2866 // Non-zero coeff only in upper-left 8x8
2843 if (eob == 1) { 2867 if (eob == 1) {
2844 vp9_high_idct32x32_1_add(input, dest, stride, bd); 2868 vp9_highbd_idct32x32_1_add(input, dest, stride, bd);
2845 } else if (eob <= 34) { 2869 } else if (eob <= 34) {
2846 vp9_high_idct32x32_34_add(input, dest, stride, bd); 2870 vp9_highbd_idct32x32_34_add(input, dest, stride, bd);
2847 } else { 2871 } else {
2848 vp9_high_idct32x32_1024_add(input, dest, stride, bd); 2872 vp9_highbd_idct32x32_1024_add(input, dest, stride, bd);
2849 } 2873 }
2850 } 2874 }
2851 2875
2852 // iht 2876 // iht
2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, 2877 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
2854 uint8_t *dest, int stride, int eob, int bd) { 2878 uint8_t *dest, int stride, int eob, int bd) {
2855 if (tx_type == DCT_DCT) 2879 if (tx_type == DCT_DCT)
2856 vp9_high_idct4x4_add(input, dest, stride, eob, bd); 2880 vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
2857 else 2881 else
2858 vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd); 2882 vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
2859 } 2883 }
2860 2884
2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, 2885 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
2862 uint8_t *dest, int stride, int eob, int bd) { 2886 uint8_t *dest, int stride, int eob, int bd) {
2863 if (tx_type == DCT_DCT) { 2887 if (tx_type == DCT_DCT) {
2864 vp9_high_idct8x8_add(input, dest, stride, eob, bd); 2888 vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
2865 } else { 2889 } else {
2866 vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd); 2890 vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
2867 } 2891 }
2868 } 2892 }
2869 2893
2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, 2894 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
2871 uint8_t *dest, int stride, int eob, int bd) { 2895 uint8_t *dest, int stride, int eob, int bd) {
2872 if (tx_type == DCT_DCT) { 2896 if (tx_type == DCT_DCT) {
2873 vp9_high_idct16x16_add(input, dest, stride, eob, bd); 2897 vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
2874 } else { 2898 } else {
2875 vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd); 2899 vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
2876 } 2900 }
2877 } 2901 }
2878 #endif // CONFIG_VP9_HIGHBITDEPTH 2902 #endif // CONFIG_VP9_HIGHBITDEPTH
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_idct.h ('k') | source/libvpx/vp9/common/vp9_loopfilter.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698