OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <math.h> | 12 #include <math.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 #include "vp9/common/vp9_systemdependent.h" | 16 #include "vp9/common/vp9_systemdependent.h" |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_common.h" | 18 #include "vp9/common/vp9_common.h" |
19 #include "vp9/common/vp9_idct.h" | 19 #include "vp9/common/vp9_idct.h" |
20 | 20 |
#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs
// strict overflow wrapping to match expected hardware implementations.
// bd of 8 uses trans_low with 16 bits, need to remove 16 bits
// bd of 10 uses trans_low with 18 bits, need to remove 14 bits
// bd of 12 uses trans_low with 20 bits, need to remove 12 bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
// The argument is parenthesized so expressions such as WRAPLOW(a + b)
// expand correctly. The expansion reads a variable 'bd' that must be in
// scope at every use site.
// NOTE(review): left-shifting a negative signed value is undefined
// behavior in ISO C; this relies on the two's-complement behavior of the
// supported toolchains -- confirm against project compiler assumptions.
#define WRAPLOW(x) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x) (x)
#endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH

#if CONFIG_VP9_HIGHBITDEPTH
// Clamp 'value' to the inclusive range [low, high].
static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
                                    tran_low_t high) {
  return value < low ? low : (value > high ? high : value);
}

// Add residual 'trans' to predictor sample 'dest' and clamp the sum to the
// valid sample range for bit depth 'bd':
// 8 -> [0, 255], 10 -> [0, 1023], 12 -> [0, 4095].
static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
                                            tran_high_t trans, int bd) {
  trans = WRAPLOW(trans);
  switch (bd) {
    case 8:
    default:
      return clamp_high(WRAPLOW(dest + trans), 0, 255);
    case 10:
      return clamp_high(WRAPLOW(dest + trans), 0, 1023);
    case 12:
      return clamp_high(WRAPLOW(dest + trans), 0, 4095);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
| 53 |
| 54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, | 55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, |
23 0.5 shifts per pixel. */ | 56 0.5 shifts per pixel. */ |
24 int i; | 57 int i; |
25 int16_t output[16]; | 58 tran_low_t output[16]; |
26 int a1, b1, c1, d1, e1; | 59 tran_high_t a1, b1, c1, d1, e1; |
27 const int16_t *ip = input; | 60 const tran_low_t *ip = input; |
28 int16_t *op = output; | 61 tran_low_t *op = output; |
29 | 62 |
30 for (i = 0; i < 4; i++) { | 63 for (i = 0; i < 4; i++) { |
31 a1 = ip[0] >> UNIT_QUANT_SHIFT; | 64 a1 = ip[0] >> UNIT_QUANT_SHIFT; |
32 c1 = ip[1] >> UNIT_QUANT_SHIFT; | 65 c1 = ip[1] >> UNIT_QUANT_SHIFT; |
33 d1 = ip[2] >> UNIT_QUANT_SHIFT; | 66 d1 = ip[2] >> UNIT_QUANT_SHIFT; |
34 b1 = ip[3] >> UNIT_QUANT_SHIFT; | 67 b1 = ip[3] >> UNIT_QUANT_SHIFT; |
35 a1 += c1; | 68 a1 += c1; |
36 d1 -= b1; | 69 d1 -= b1; |
37 e1 = (a1 - d1) >> 1; | 70 e1 = (a1 - d1) >> 1; |
38 b1 = e1 - b1; | 71 b1 = e1 - b1; |
(...skipping 24 matching lines...) Expand all Loading... |
63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); | 96 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); |
64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); | 97 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); |
65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); | 98 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); |
66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); | 99 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); |
67 | 100 |
68 ip++; | 101 ip++; |
69 dest++; | 102 dest++; |
70 } | 103 } |
71 } | 104 } |
72 | 105 |
73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { | 106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { |
74 int i; | 107 int i; |
75 int a1, e1; | 108 tran_high_t a1, e1; |
76 int16_t tmp[4]; | 109 tran_low_t tmp[4]; |
77 const int16_t *ip = in; | 110 const tran_low_t *ip = in; |
78 int16_t *op = tmp; | 111 tran_low_t *op = tmp; |
79 | 112 |
80 a1 = ip[0] >> UNIT_QUANT_SHIFT; | 113 a1 = ip[0] >> UNIT_QUANT_SHIFT; |
81 e1 = a1 >> 1; | 114 e1 = a1 >> 1; |
82 a1 -= e1; | 115 a1 -= e1; |
83 op[0] = a1; | 116 op[0] = a1; |
84 op[1] = op[2] = op[3] = e1; | 117 op[1] = op[2] = op[3] = e1; |
85 | 118 |
86 ip = tmp; | 119 ip = tmp; |
87 for (i = 0; i < 4; i++) { | 120 for (i = 0; i < 4; i++) { |
88 e1 = ip[0] >> 1; | 121 e1 = ip[0] >> 1; |
89 a1 = ip[0] - e1; | 122 a1 = ip[0] - e1; |
90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); | 123 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); |
91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); | 124 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); |
92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); | 125 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); |
93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); | 126 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); |
94 ip++; | 127 ip++; |
95 dest++; | 128 dest++; |
96 } | 129 } |
97 } | 130 } |
98 | 131 |
99 static void idct4(const int16_t *input, int16_t *output) { | 132 static void idct4(const tran_low_t *input, tran_low_t *output) { |
100 int16_t step[4]; | 133 tran_low_t step[4]; |
101 int temp1, temp2; | 134 tran_high_t temp1, temp2; |
102 // stage 1 | 135 // stage 1 |
103 temp1 = (input[0] + input[2]) * cospi_16_64; | 136 temp1 = (input[0] + input[2]) * cospi_16_64; |
104 temp2 = (input[0] - input[2]) * cospi_16_64; | 137 temp2 = (input[0] - input[2]) * cospi_16_64; |
105 step[0] = dct_const_round_shift(temp1); | 138 step[0] = dct_const_round_shift(temp1); |
106 step[1] = dct_const_round_shift(temp2); | 139 step[1] = dct_const_round_shift(temp2); |
107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; | 140 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; |
108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; | 141 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; |
109 step[2] = dct_const_round_shift(temp1); | 142 step[2] = dct_const_round_shift(temp1); |
110 step[3] = dct_const_round_shift(temp2); | 143 step[3] = dct_const_round_shift(temp2); |
111 | 144 |
112 // stage 2 | 145 // stage 2 |
113 output[0] = step[0] + step[3]; | 146 output[0] = step[0] + step[3]; |
114 output[1] = step[1] + step[2]; | 147 output[1] = step[1] + step[2]; |
115 output[2] = step[1] - step[2]; | 148 output[2] = step[1] - step[2]; |
116 output[3] = step[0] - step[3]; | 149 output[3] = step[0] - step[3]; |
117 } | 150 } |
118 | 151 |
119 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { | 152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
120 int16_t out[4 * 4]; | 153 tran_low_t out[4 * 4]; |
121 int16_t *outptr = out; | 154 tran_low_t *outptr = out; |
122 int i, j; | 155 int i, j; |
123 int16_t temp_in[4], temp_out[4]; | 156 tran_low_t temp_in[4], temp_out[4]; |
124 | 157 |
125 // Rows | 158 // Rows |
126 for (i = 0; i < 4; ++i) { | 159 for (i = 0; i < 4; ++i) { |
127 idct4(input, outptr); | 160 idct4(input, outptr); |
128 input += 4; | 161 input += 4; |
129 outptr += 4; | 162 outptr += 4; |
130 } | 163 } |
131 | 164 |
132 // Columns | 165 // Columns |
133 for (i = 0; i < 4; ++i) { | 166 for (i = 0; i < 4; ++i) { |
134 for (j = 0; j < 4; ++j) | 167 for (j = 0; j < 4; ++j) |
135 temp_in[j] = out[j * 4 + i]; | 168 temp_in[j] = out[j * 4 + i]; |
136 idct4(temp_in, temp_out); | 169 idct4(temp_in, temp_out); |
137 for (j = 0; j < 4; ++j) | 170 for (j = 0; j < 4; ++j) |
138 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) | 171 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) |
139 + dest[j * stride + i]); | 172 + dest[j * stride + i]); |
140 } | 173 } |
141 } | 174 } |
142 | 175 |
143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { | 176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, |
| 177 int dest_stride) { |
144 int i; | 178 int i; |
145 int a1; | 179 tran_high_t a1; |
146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 180 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
147 out = dct_const_round_shift(out * cospi_16_64); | 181 out = dct_const_round_shift(out * cospi_16_64); |
148 a1 = ROUND_POWER_OF_TWO(out, 4); | 182 a1 = ROUND_POWER_OF_TWO(out, 4); |
149 | 183 |
150 for (i = 0; i < 4; i++) { | 184 for (i = 0; i < 4; i++) { |
151 dest[0] = clip_pixel(dest[0] + a1); | 185 dest[0] = clip_pixel(dest[0] + a1); |
152 dest[1] = clip_pixel(dest[1] + a1); | 186 dest[1] = clip_pixel(dest[1] + a1); |
153 dest[2] = clip_pixel(dest[2] + a1); | 187 dest[2] = clip_pixel(dest[2] + a1); |
154 dest[3] = clip_pixel(dest[3] + a1); | 188 dest[3] = clip_pixel(dest[3] + a1); |
155 dest += dest_stride; | 189 dest += dest_stride; |
156 } | 190 } |
157 } | 191 } |
158 | 192 |
159 static void idct8(const int16_t *input, int16_t *output) { | 193 static void idct8(const tran_low_t *input, tran_low_t *output) { |
160 int16_t step1[8], step2[8]; | 194 tran_low_t step1[8], step2[8]; |
161 int temp1, temp2; | 195 tran_high_t temp1, temp2; |
162 // stage 1 | 196 // stage 1 |
163 step1[0] = input[0]; | 197 step1[0] = input[0]; |
164 step1[2] = input[4]; | 198 step1[2] = input[4]; |
165 step1[1] = input[2]; | 199 step1[1] = input[2]; |
166 step1[3] = input[6]; | 200 step1[3] = input[6]; |
167 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; | 201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; |
168 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; | 202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; |
169 step1[4] = dct_const_round_shift(temp1); | 203 step1[4] = dct_const_round_shift(temp1); |
170 step1[7] = dct_const_round_shift(temp2); | 204 step1[7] = dct_const_round_shift(temp2); |
171 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; | 205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; |
(...skipping 22 matching lines...) Expand all Loading... |
194 output[0] = step1[0] + step1[7]; | 228 output[0] = step1[0] + step1[7]; |
195 output[1] = step1[1] + step1[6]; | 229 output[1] = step1[1] + step1[6]; |
196 output[2] = step1[2] + step1[5]; | 230 output[2] = step1[2] + step1[5]; |
197 output[3] = step1[3] + step1[4]; | 231 output[3] = step1[3] + step1[4]; |
198 output[4] = step1[3] - step1[4]; | 232 output[4] = step1[3] - step1[4]; |
199 output[5] = step1[2] - step1[5]; | 233 output[5] = step1[2] - step1[5]; |
200 output[6] = step1[1] - step1[6]; | 234 output[6] = step1[1] - step1[6]; |
201 output[7] = step1[0] - step1[7]; | 235 output[7] = step1[0] - step1[7]; |
202 } | 236 } |
203 | 237 |
204 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { | 238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
205 int16_t out[8 * 8]; | 239 tran_low_t out[8 * 8]; |
206 int16_t *outptr = out; | 240 tran_low_t *outptr = out; |
207 int i, j; | 241 int i, j; |
208 int16_t temp_in[8], temp_out[8]; | 242 tran_low_t temp_in[8], temp_out[8]; |
209 | 243 |
210 // First transform rows | 244 // First transform rows |
211 for (i = 0; i < 8; ++i) { | 245 for (i = 0; i < 8; ++i) { |
212 idct8(input, outptr); | 246 idct8(input, outptr); |
213 input += 8; | 247 input += 8; |
214 outptr += 8; | 248 outptr += 8; |
215 } | 249 } |
216 | 250 |
217 // Then transform columns | 251 // Then transform columns |
218 for (i = 0; i < 8; ++i) { | 252 for (i = 0; i < 8; ++i) { |
219 for (j = 0; j < 8; ++j) | 253 for (j = 0; j < 8; ++j) |
220 temp_in[j] = out[j * 8 + i]; | 254 temp_in[j] = out[j * 8 + i]; |
221 idct8(temp_in, temp_out); | 255 idct8(temp_in, temp_out); |
222 for (j = 0; j < 8; ++j) | 256 for (j = 0; j < 8; ++j) |
223 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 257 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
224 + dest[j * stride + i]); | 258 + dest[j * stride + i]); |
225 } | 259 } |
226 } | 260 } |
227 | 261 |
228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { | 262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
229 int i, j; | 263 int i, j; |
230 int a1; | 264 tran_high_t a1; |
231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 265 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
232 out = dct_const_round_shift(out * cospi_16_64); | 266 out = dct_const_round_shift(out * cospi_16_64); |
233 a1 = ROUND_POWER_OF_TWO(out, 5); | 267 a1 = ROUND_POWER_OF_TWO(out, 5); |
234 for (j = 0; j < 8; ++j) { | 268 for (j = 0; j < 8; ++j) { |
235 for (i = 0; i < 8; ++i) | 269 for (i = 0; i < 8; ++i) |
236 dest[i] = clip_pixel(dest[i] + a1); | 270 dest[i] = clip_pixel(dest[i] + a1); |
237 dest += stride; | 271 dest += stride; |
238 } | 272 } |
239 } | 273 } |
240 | 274 |
241 static void iadst4(const int16_t *input, int16_t *output) { | 275 static void iadst4(const tran_low_t *input, tran_low_t *output) { |
242 int s0, s1, s2, s3, s4, s5, s6, s7; | 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
243 | 277 |
244 int x0 = input[0]; | 278 tran_high_t x0 = input[0]; |
245 int x1 = input[1]; | 279 tran_high_t x1 = input[1]; |
246 int x2 = input[2]; | 280 tran_high_t x2 = input[2]; |
247 int x3 = input[3]; | 281 tran_high_t x3 = input[3]; |
248 | 282 |
249 if (!(x0 | x1 | x2 | x3)) { | 283 if (!(x0 | x1 | x2 | x3)) { |
250 output[0] = output[1] = output[2] = output[3] = 0; | 284 output[0] = output[1] = output[2] = output[3] = 0; |
251 return; | 285 return; |
252 } | 286 } |
253 | 287 |
254 s0 = sinpi_1_9 * x0; | 288 s0 = sinpi_1_9 * x0; |
255 s1 = sinpi_2_9 * x0; | 289 s1 = sinpi_2_9 * x0; |
256 s2 = sinpi_3_9 * x1; | 290 s2 = sinpi_3_9 * x1; |
257 s3 = sinpi_4_9 * x2; | 291 s3 = sinpi_4_9 * x2; |
(...skipping 15 matching lines...) Expand all Loading... |
273 // 1-D transform scaling factor is sqrt(2). | 307 // 1-D transform scaling factor is sqrt(2). |
274 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) | 308 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) |
275 // + 1b (addition) = 29b. | 309 // + 1b (addition) = 29b. |
276 // Hence the output bit depth is 15b. | 310 // Hence the output bit depth is 15b. |
277 output[0] = dct_const_round_shift(s0); | 311 output[0] = dct_const_round_shift(s0); |
278 output[1] = dct_const_round_shift(s1); | 312 output[1] = dct_const_round_shift(s1); |
279 output[2] = dct_const_round_shift(s2); | 313 output[2] = dct_const_round_shift(s2); |
280 output[3] = dct_const_round_shift(s3); | 314 output[3] = dct_const_round_shift(s3); |
281 } | 315 } |
282 | 316 |
283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, | 317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, |
284 int tx_type) { | 318 int tx_type) { |
285 const transform_2d IHT_4[] = { | 319 const transform_2d IHT_4[] = { |
286 { idct4, idct4 }, // DCT_DCT = 0 | 320 { idct4, idct4 }, // DCT_DCT = 0 |
287 { iadst4, idct4 }, // ADST_DCT = 1 | 321 { iadst4, idct4 }, // ADST_DCT = 1 |
288 { idct4, iadst4 }, // DCT_ADST = 2 | 322 { idct4, iadst4 }, // DCT_ADST = 2 |
289 { iadst4, iadst4 } // ADST_ADST = 3 | 323 { iadst4, iadst4 } // ADST_ADST = 3 |
290 }; | 324 }; |
291 | 325 |
292 int i, j; | 326 int i, j; |
293 int16_t out[4 * 4]; | 327 tran_low_t out[4 * 4]; |
294 int16_t *outptr = out; | 328 tran_low_t *outptr = out; |
295 int16_t temp_in[4], temp_out[4]; | 329 tran_low_t temp_in[4], temp_out[4]; |
296 | 330 |
297 // inverse transform row vectors | 331 // inverse transform row vectors |
298 for (i = 0; i < 4; ++i) { | 332 for (i = 0; i < 4; ++i) { |
299 IHT_4[tx_type].rows(input, outptr); | 333 IHT_4[tx_type].rows(input, outptr); |
300 input += 4; | 334 input += 4; |
301 outptr += 4; | 335 outptr += 4; |
302 } | 336 } |
303 | 337 |
304 // inverse transform column vectors | 338 // inverse transform column vectors |
305 for (i = 0; i < 4; ++i) { | 339 for (i = 0; i < 4; ++i) { |
306 for (j = 0; j < 4; ++j) | 340 for (j = 0; j < 4; ++j) |
307 temp_in[j] = out[j * 4 + i]; | 341 temp_in[j] = out[j * 4 + i]; |
308 IHT_4[tx_type].cols(temp_in, temp_out); | 342 IHT_4[tx_type].cols(temp_in, temp_out); |
309 for (j = 0; j < 4; ++j) | 343 for (j = 0; j < 4; ++j) |
310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) | 344 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) |
311 + dest[j * stride + i]); | 345 + dest[j * stride + i]); |
312 } | 346 } |
313 } | 347 } |
314 static void iadst8(const int16_t *input, int16_t *output) { | 348 static void iadst8(const tran_low_t *input, tran_low_t *output) { |
315 int s0, s1, s2, s3, s4, s5, s6, s7; | 349 int s0, s1, s2, s3, s4, s5, s6, s7; |
316 | 350 |
317 int x0 = input[7]; | 351 tran_high_t x0 = input[7]; |
318 int x1 = input[0]; | 352 tran_high_t x1 = input[0]; |
319 int x2 = input[5]; | 353 tran_high_t x2 = input[5]; |
320 int x3 = input[2]; | 354 tran_high_t x3 = input[2]; |
321 int x4 = input[3]; | 355 tran_high_t x4 = input[3]; |
322 int x5 = input[4]; | 356 tran_high_t x5 = input[4]; |
323 int x6 = input[1]; | 357 tran_high_t x6 = input[1]; |
324 int x7 = input[6]; | 358 tran_high_t x7 = input[6]; |
325 | 359 |
326 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { | 360 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { |
327 output[0] = output[1] = output[2] = output[3] = output[4] | 361 output[0] = output[1] = output[2] = output[3] = output[4] |
328 = output[5] = output[6] = output[7] = 0; | 362 = output[5] = output[6] = output[7] = 0; |
329 return; | 363 return; |
330 } | 364 } |
331 | 365 |
332 // stage 1 | 366 // stage 1 |
333 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | 367 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; |
334 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | 368 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
388 output[7] = -x1; | 422 output[7] = -x1; |
389 } | 423 } |
390 | 424 |
// Row/column 1-D transform pairs for the 8x8 hybrid transform, indexed by
// tx_type ({DCT, ADST} on rows x {DCT, ADST} on columns).
static const transform_2d IHT_8[] = {
  { idct8, idct8 },   // DCT_DCT = 0
  { iadst8, idct8 },  // ADST_DCT = 1
  { idct8, iadst8 },  // DCT_ADST = 2
  { iadst8, iadst8 }  // ADST_ADST = 3
};
397 | 431 |
398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, | 432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, |
399 int tx_type) { | 433 int tx_type) { |
400 int i, j; | 434 int i, j; |
401 int16_t out[8 * 8]; | 435 tran_low_t out[8 * 8]; |
402 int16_t *outptr = out; | 436 tran_low_t *outptr = out; |
403 int16_t temp_in[8], temp_out[8]; | 437 tran_low_t temp_in[8], temp_out[8]; |
404 const transform_2d ht = IHT_8[tx_type]; | 438 const transform_2d ht = IHT_8[tx_type]; |
405 | 439 |
406 // inverse transform row vectors | 440 // inverse transform row vectors |
407 for (i = 0; i < 8; ++i) { | 441 for (i = 0; i < 8; ++i) { |
408 ht.rows(input, outptr); | 442 ht.rows(input, outptr); |
409 input += 8; | 443 input += 8; |
410 outptr += 8; | 444 outptr += 8; |
411 } | 445 } |
412 | 446 |
413 // inverse transform column vectors | 447 // inverse transform column vectors |
414 for (i = 0; i < 8; ++i) { | 448 for (i = 0; i < 8; ++i) { |
415 for (j = 0; j < 8; ++j) | 449 for (j = 0; j < 8; ++j) |
416 temp_in[j] = out[j * 8 + i]; | 450 temp_in[j] = out[j * 8 + i]; |
417 ht.cols(temp_in, temp_out); | 451 ht.cols(temp_in, temp_out); |
418 for (j = 0; j < 8; ++j) | 452 for (j = 0; j < 8; ++j) |
419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 453 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
420 + dest[j * stride + i]); | 454 + dest[j * stride + i]); |
421 } | 455 } |
422 } | 456 } |
423 | 457 |
424 void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { | 458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
425 int16_t out[8 * 8] = { 0 }; | 459 tran_low_t out[8 * 8] = { 0 }; |
426 int16_t *outptr = out; | 460 tran_low_t *outptr = out; |
427 int i, j; | 461 int i, j; |
428 int16_t temp_in[8], temp_out[8]; | 462 tran_low_t temp_in[8], temp_out[8]; |
429 | 463 |
430 // First transform rows | 464 // First transform rows |
431 // only first 4 row has non-zero coefs | 465 // only first 4 row has non-zero coefs |
432 for (i = 0; i < 4; ++i) { | 466 for (i = 0; i < 4; ++i) { |
433 idct8(input, outptr); | 467 idct8(input, outptr); |
434 input += 8; | 468 input += 8; |
435 outptr += 8; | 469 outptr += 8; |
436 } | 470 } |
437 | 471 |
438 // Then transform columns | 472 // Then transform columns |
439 for (i = 0; i < 8; ++i) { | 473 for (i = 0; i < 8; ++i) { |
440 for (j = 0; j < 8; ++j) | 474 for (j = 0; j < 8; ++j) |
441 temp_in[j] = out[j * 8 + i]; | 475 temp_in[j] = out[j * 8 + i]; |
442 idct8(temp_in, temp_out); | 476 idct8(temp_in, temp_out); |
443 for (j = 0; j < 8; ++j) | 477 for (j = 0; j < 8; ++j) |
444 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 478 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
445 + dest[j * stride + i]); | 479 + dest[j * stride + i]); |
446 } | 480 } |
447 } | 481 } |
448 | 482 |
449 static void idct16(const int16_t *input, int16_t *output) { | 483 static void idct16(const tran_low_t *input, tran_low_t *output) { |
450 int16_t step1[16], step2[16]; | 484 tran_low_t step1[16], step2[16]; |
451 int temp1, temp2; | 485 tran_high_t temp1, temp2; |
452 | 486 |
453 // stage 1 | 487 // stage 1 |
454 step1[0] = input[0/2]; | 488 step1[0] = input[0/2]; |
455 step1[1] = input[16/2]; | 489 step1[1] = input[16/2]; |
456 step1[2] = input[8/2]; | 490 step1[2] = input[8/2]; |
457 step1[3] = input[24/2]; | 491 step1[3] = input[24/2]; |
458 step1[4] = input[4/2]; | 492 step1[4] = input[4/2]; |
459 step1[5] = input[20/2]; | 493 step1[5] = input[20/2]; |
460 step1[6] = input[12/2]; | 494 step1[6] = input[12/2]; |
461 step1[7] = input[28/2]; | 495 step1[7] = input[28/2]; |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
604 output[8] = step2[7] - step2[8]; | 638 output[8] = step2[7] - step2[8]; |
605 output[9] = step2[6] - step2[9]; | 639 output[9] = step2[6] - step2[9]; |
606 output[10] = step2[5] - step2[10]; | 640 output[10] = step2[5] - step2[10]; |
607 output[11] = step2[4] - step2[11]; | 641 output[11] = step2[4] - step2[11]; |
608 output[12] = step2[3] - step2[12]; | 642 output[12] = step2[3] - step2[12]; |
609 output[13] = step2[2] - step2[13]; | 643 output[13] = step2[2] - step2[13]; |
610 output[14] = step2[1] - step2[14]; | 644 output[14] = step2[1] - step2[14]; |
611 output[15] = step2[0] - step2[15]; | 645 output[15] = step2[0] - step2[15]; |
612 } | 646 } |
613 | 647 |
614 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { | 648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, |
615 int16_t out[16 * 16]; | 649 int stride) { |
616 int16_t *outptr = out; | 650 tran_low_t out[16 * 16]; |
| 651 tran_low_t *outptr = out; |
617 int i, j; | 652 int i, j; |
618 int16_t temp_in[16], temp_out[16]; | 653 tran_low_t temp_in[16], temp_out[16]; |
619 | 654 |
620 // First transform rows | 655 // First transform rows |
621 for (i = 0; i < 16; ++i) { | 656 for (i = 0; i < 16; ++i) { |
622 idct16(input, outptr); | 657 idct16(input, outptr); |
623 input += 16; | 658 input += 16; |
624 outptr += 16; | 659 outptr += 16; |
625 } | 660 } |
626 | 661 |
627 // Then transform columns | 662 // Then transform columns |
628 for (i = 0; i < 16; ++i) { | 663 for (i = 0; i < 16; ++i) { |
629 for (j = 0; j < 16; ++j) | 664 for (j = 0; j < 16; ++j) |
630 temp_in[j] = out[j * 16 + i]; | 665 temp_in[j] = out[j * 16 + i]; |
631 idct16(temp_in, temp_out); | 666 idct16(temp_in, temp_out); |
632 for (j = 0; j < 16; ++j) | 667 for (j = 0; j < 16; ++j) |
633 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 668 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
634 + dest[j * stride + i]); | 669 + dest[j * stride + i]); |
635 } | 670 } |
636 } | 671 } |
637 | 672 |
638 static void iadst16(const int16_t *input, int16_t *output) { | 673 static void iadst16(const tran_low_t *input, tran_low_t *output) { |
639 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 674 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; |
| 675 tran_high_t s9, s10, s11, s12, s13, s14, s15; |
640 | 676 |
641 int x0 = input[15]; | 677 tran_high_t x0 = input[15]; |
642 int x1 = input[0]; | 678 tran_high_t x1 = input[0]; |
643 int x2 = input[13]; | 679 tran_high_t x2 = input[13]; |
644 int x3 = input[2]; | 680 tran_high_t x3 = input[2]; |
645 int x4 = input[11]; | 681 tran_high_t x4 = input[11]; |
646 int x5 = input[4]; | 682 tran_high_t x5 = input[4]; |
647 int x6 = input[9]; | 683 tran_high_t x6 = input[9]; |
648 int x7 = input[6]; | 684 tran_high_t x7 = input[6]; |
649 int x8 = input[7]; | 685 tran_high_t x8 = input[7]; |
650 int x9 = input[8]; | 686 tran_high_t x9 = input[8]; |
651 int x10 = input[5]; | 687 tran_high_t x10 = input[5]; |
652 int x11 = input[10]; | 688 tran_high_t x11 = input[10]; |
653 int x12 = input[3]; | 689 tran_high_t x12 = input[3]; |
654 int x13 = input[12]; | 690 tran_high_t x13 = input[12]; |
655 int x14 = input[1]; | 691 tran_high_t x14 = input[1]; |
656 int x15 = input[14]; | 692 tran_high_t x15 = input[14]; |
657 | 693 |
658 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | 694 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 |
659 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { | 695 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { |
660 output[0] = output[1] = output[2] = output[3] = output[4] | 696 output[0] = output[1] = output[2] = output[3] = output[4] |
661 = output[5] = output[6] = output[7] = output[8] | 697 = output[5] = output[6] = output[7] = output[8] |
662 = output[9] = output[10] = output[11] = output[12] | 698 = output[9] = output[10] = output[11] = output[12] |
663 = output[13] = output[14] = output[15] = 0; | 699 = output[13] = output[14] = output[15] = 0; |
664 return; | 700 return; |
665 } | 701 } |
666 | 702 |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
806 output[15] = -x1; | 842 output[15] = -x1; |
807 } | 843 } |
808 | 844 |
// Row/column 1-D transform pairs for the 16x16 hybrid transform, indexed by
// tx_type ({DCT, ADST} on rows x {DCT, ADST} on columns).
static const transform_2d IHT_16[] = {
  { idct16, idct16 },   // DCT_DCT = 0
  { iadst16, idct16 },  // ADST_DCT = 1
  { idct16, iadst16 },  // DCT_ADST = 2
  { iadst16, iadst16 }  // ADST_ADST = 3
};
815 | 851 |
816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, | 852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, |
817 int tx_type) { | 853 int tx_type) { |
818 int i, j; | 854 int i, j; |
819 int16_t out[16 * 16]; | 855 tran_low_t out[16 * 16]; |
820 int16_t *outptr = out; | 856 tran_low_t *outptr = out; |
821 int16_t temp_in[16], temp_out[16]; | 857 tran_low_t temp_in[16], temp_out[16]; |
822 const transform_2d ht = IHT_16[tx_type]; | 858 const transform_2d ht = IHT_16[tx_type]; |
823 | 859 |
824 // Rows | 860 // Rows |
825 for (i = 0; i < 16; ++i) { | 861 for (i = 0; i < 16; ++i) { |
826 ht.rows(input, outptr); | 862 ht.rows(input, outptr); |
827 input += 16; | 863 input += 16; |
828 outptr += 16; | 864 outptr += 16; |
829 } | 865 } |
830 | 866 |
831 // Columns | 867 // Columns |
832 for (i = 0; i < 16; ++i) { | 868 for (i = 0; i < 16; ++i) { |
833 for (j = 0; j < 16; ++j) | 869 for (j = 0; j < 16; ++j) |
834 temp_in[j] = out[j * 16 + i]; | 870 temp_in[j] = out[j * 16 + i]; |
835 ht.cols(temp_in, temp_out); | 871 ht.cols(temp_in, temp_out); |
836 for (j = 0; j < 16; ++j) | 872 for (j = 0; j < 16; ++j) |
837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 873 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
838 + dest[j * stride + i]); | 874 + dest[j * stride + i]); |
839 } | 875 } |
840 } | 876 } |
841 | 877 |
842 void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { | 878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, |
843 int16_t out[16 * 16] = { 0 }; | 879 int stride) { |
844 int16_t *outptr = out; | 880 tran_low_t out[16 * 16] = { 0 }; |
| 881 tran_low_t *outptr = out; |
845 int i, j; | 882 int i, j; |
846 int16_t temp_in[16], temp_out[16]; | 883 tran_low_t temp_in[16], temp_out[16]; |
847 | 884 |
848 // First transform rows. Since all non-zero dct coefficients are in | 885 // First transform rows. Since all non-zero dct coefficients are in |
849 // upper-left 4x4 area, we only need to calculate first 4 rows here. | 886 // upper-left 4x4 area, we only need to calculate first 4 rows here. |
850 for (i = 0; i < 4; ++i) { | 887 for (i = 0; i < 4; ++i) { |
851 idct16(input, outptr); | 888 idct16(input, outptr); |
852 input += 16; | 889 input += 16; |
853 outptr += 16; | 890 outptr += 16; |
854 } | 891 } |
855 | 892 |
856 // Then transform columns | 893 // Then transform columns |
857 for (i = 0; i < 16; ++i) { | 894 for (i = 0; i < 16; ++i) { |
858 for (j = 0; j < 16; ++j) | 895 for (j = 0; j < 16; ++j) |
859 temp_in[j] = out[j*16 + i]; | 896 temp_in[j] = out[j*16 + i]; |
860 idct16(temp_in, temp_out); | 897 idct16(temp_in, temp_out); |
861 for (j = 0; j < 16; ++j) | 898 for (j = 0; j < 16; ++j) |
862 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 899 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
863 + dest[j * stride + i]); | 900 + dest[j * stride + i]); |
864 } | 901 } |
865 } | 902 } |
866 | 903 |
867 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { | 904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
868 int i, j; | 905 int i, j; |
869 int a1; | 906 tran_high_t a1; |
870 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 907 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
871 out = dct_const_round_shift(out * cospi_16_64); | 908 out = dct_const_round_shift(out * cospi_16_64); |
872 a1 = ROUND_POWER_OF_TWO(out, 6); | 909 a1 = ROUND_POWER_OF_TWO(out, 6); |
873 for (j = 0; j < 16; ++j) { | 910 for (j = 0; j < 16; ++j) { |
874 for (i = 0; i < 16; ++i) | 911 for (i = 0; i < 16; ++i) |
875 dest[i] = clip_pixel(dest[i] + a1); | 912 dest[i] = clip_pixel(dest[i] + a1); |
876 dest += stride; | 913 dest += stride; |
877 } | 914 } |
878 } | 915 } |
879 | 916 |
880 static void idct32(const int16_t *input, int16_t *output) { | 917 static void idct32(const tran_low_t *input, tran_low_t *output) { |
881 int16_t step1[32], step2[32]; | 918 tran_low_t step1[32], step2[32]; |
882 int temp1, temp2; | 919 tran_high_t temp1, temp2; |
883 | 920 |
884 // stage 1 | 921 // stage 1 |
885 step1[0] = input[0]; | 922 step1[0] = input[0]; |
886 step1[1] = input[16]; | 923 step1[1] = input[16]; |
887 step1[2] = input[8]; | 924 step1[2] = input[8]; |
888 step1[3] = input[24]; | 925 step1[3] = input[24]; |
889 step1[4] = input[4]; | 926 step1[4] = input[4]; |
890 step1[5] = input[20]; | 927 step1[5] = input[20]; |
891 step1[6] = input[12]; | 928 step1[6] = input[12]; |
892 step1[7] = input[28]; | 929 step1[7] = input[28]; |
(...skipping 344 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1237 output[24] = step1[7] - step1[24]; | 1274 output[24] = step1[7] - step1[24]; |
1238 output[25] = step1[6] - step1[25]; | 1275 output[25] = step1[6] - step1[25]; |
1239 output[26] = step1[5] - step1[26]; | 1276 output[26] = step1[5] - step1[26]; |
1240 output[27] = step1[4] - step1[27]; | 1277 output[27] = step1[4] - step1[27]; |
1241 output[28] = step1[3] - step1[28]; | 1278 output[28] = step1[3] - step1[28]; |
1242 output[29] = step1[2] - step1[29]; | 1279 output[29] = step1[2] - step1[29]; |
1243 output[30] = step1[1] - step1[30]; | 1280 output[30] = step1[1] - step1[30]; |
1244 output[31] = step1[0] - step1[31]; | 1281 output[31] = step1[0] - step1[31]; |
1245 } | 1282 } |
1246 | 1283 |
1247 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { | 1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, |
1248 int16_t out[32 * 32]; | 1285 int stride) { |
1249 int16_t *outptr = out; | 1286 tran_low_t out[32 * 32]; |
| 1287 tran_low_t *outptr = out; |
1250 int i, j; | 1288 int i, j; |
1251 int16_t temp_in[32], temp_out[32]; | 1289 tran_low_t temp_in[32], temp_out[32]; |
1252 | 1290 |
1253 // Rows | 1291 // Rows |
1254 for (i = 0; i < 32; ++i) { | 1292 for (i = 0; i < 32; ++i) { |
1255 int16_t zero_coeff[16]; | 1293 int16_t zero_coeff[16]; |
1256 for (j = 0; j < 16; ++j) | 1294 for (j = 0; j < 16; ++j) |
1257 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; | 1295 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; |
1258 for (j = 0; j < 8; ++j) | 1296 for (j = 0; j < 8; ++j) |
1259 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; | 1297 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
1260 for (j = 0; j < 4; ++j) | 1298 for (j = 0; j < 4; ++j) |
1261 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; | 1299 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
1262 for (j = 0; j < 2; ++j) | 1300 for (j = 0; j < 2; ++j) |
1263 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; | 1301 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
1264 | 1302 |
1265 if (zero_coeff[0] | zero_coeff[1]) | 1303 if (zero_coeff[0] | zero_coeff[1]) |
1266 idct32(input, outptr); | 1304 idct32(input, outptr); |
1267 else | 1305 else |
1268 vpx_memset(outptr, 0, sizeof(int16_t) * 32); | 1306 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); |
1269 input += 32; | 1307 input += 32; |
1270 outptr += 32; | 1308 outptr += 32; |
1271 } | 1309 } |
1272 | 1310 |
1273 // Columns | 1311 // Columns |
1274 for (i = 0; i < 32; ++i) { | 1312 for (i = 0; i < 32; ++i) { |
1275 for (j = 0; j < 32; ++j) | 1313 for (j = 0; j < 32; ++j) |
1276 temp_in[j] = out[j * 32 + i]; | 1314 temp_in[j] = out[j * 32 + i]; |
1277 idct32(temp_in, temp_out); | 1315 idct32(temp_in, temp_out); |
1278 for (j = 0; j < 32; ++j) | 1316 for (j = 0; j < 32; ++j) |
1279 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 1317 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1280 + dest[j * stride + i]); | 1318 + dest[j * stride + i]); |
1281 } | 1319 } |
1282 } | 1320 } |
1283 | 1321 |
1284 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { | 1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, |
1285 int16_t out[32 * 32] = {0}; | 1323 int stride) { |
1286 int16_t *outptr = out; | 1324 tran_low_t out[32 * 32] = {0}; |
| 1325 tran_low_t *outptr = out; |
1287 int i, j; | 1326 int i, j; |
1288 int16_t temp_in[32], temp_out[32]; | 1327 tran_low_t temp_in[32], temp_out[32]; |
1289 | 1328 |
1290 // Rows | 1329 // Rows |
1291 // only upper-left 8x8 has non-zero coeff | 1330 // only upper-left 8x8 has non-zero coeff |
1292 for (i = 0; i < 8; ++i) { | 1331 for (i = 0; i < 8; ++i) { |
1293 idct32(input, outptr); | 1332 idct32(input, outptr); |
1294 input += 32; | 1333 input += 32; |
1295 outptr += 32; | 1334 outptr += 32; |
1296 } | 1335 } |
1297 | 1336 |
1298 // Columns | 1337 // Columns |
1299 for (i = 0; i < 32; ++i) { | 1338 for (i = 0; i < 32; ++i) { |
1300 for (j = 0; j < 32; ++j) | 1339 for (j = 0; j < 32; ++j) |
1301 temp_in[j] = out[j * 32 + i]; | 1340 temp_in[j] = out[j * 32 + i]; |
1302 idct32(temp_in, temp_out); | 1341 idct32(temp_in, temp_out); |
1303 for (j = 0; j < 32; ++j) | 1342 for (j = 0; j < 32; ++j) |
1304 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 1343 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1305 + dest[j * stride + i]); | 1344 + dest[j * stride + i]); |
1306 } | 1345 } |
1307 } | 1346 } |
1308 | 1347 |
1309 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { | 1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
1310 int i, j; | 1349 int i, j; |
1311 int a1; | 1350 tran_high_t a1; |
1312 | 1351 |
1313 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 1352 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
1314 out = dct_const_round_shift(out * cospi_16_64); | 1353 out = dct_const_round_shift(out * cospi_16_64); |
1315 a1 = ROUND_POWER_OF_TWO(out, 6); | 1354 a1 = ROUND_POWER_OF_TWO(out, 6); |
1316 | 1355 |
1317 for (j = 0; j < 32; ++j) { | 1356 for (j = 0; j < 32; ++j) { |
1318 for (i = 0; i < 32; ++i) | 1357 for (i = 0; i < 32; ++i) |
1319 dest[i] = clip_pixel(dest[i] + a1); | 1358 dest[i] = clip_pixel(dest[i] + a1); |
1320 dest += stride; | 1359 dest += stride; |
1321 } | 1360 } |
1322 } | 1361 } |
1323 | 1362 |
1324 // idct | 1363 // idct |
1325 void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { | 1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 1365 int eob) { |
1326 if (eob > 1) | 1366 if (eob > 1) |
1327 vp9_idct4x4_16_add(input, dest, stride); | 1367 vp9_idct4x4_16_add(input, dest, stride); |
1328 else | 1368 else |
1329 vp9_idct4x4_1_add(input, dest, stride); | 1369 vp9_idct4x4_1_add(input, dest, stride); |
1330 } | 1370 } |
1331 | 1371 |
1332 | 1372 |
1333 void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { | 1373 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 1374 int eob) { |
1334 if (eob > 1) | 1375 if (eob > 1) |
1335 vp9_iwht4x4_16_add(input, dest, stride); | 1376 vp9_iwht4x4_16_add(input, dest, stride); |
1336 else | 1377 else |
1337 vp9_iwht4x4_1_add(input, dest, stride); | 1378 vp9_iwht4x4_1_add(input, dest, stride); |
1338 } | 1379 } |
1339 | 1380 |
1340 void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { | 1381 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 1382 int eob) { |
1341 // If dc is 1, then input[0] is the reconstructed value, do not need | 1383 // If dc is 1, then input[0] is the reconstructed value, do not need |
1342 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. | 1384 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. |
1343 | 1385 |
1344 // The calculation can be simplified if there are not many non-zero dct | 1386 // The calculation can be simplified if there are not many non-zero dct |
1345 // coefficients. Use eobs to decide what to do. | 1387 // coefficients. Use eobs to decide what to do. |
1346 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. | 1388 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. |
1347 // Combine that with code here. | 1389 // Combine that with code here. |
1348 if (eob == 1) | 1390 if (eob == 1) |
1349 // DC only DCT coefficient | 1391 // DC only DCT coefficient |
1350 vp9_idct8x8_1_add(input, dest, stride); | 1392 vp9_idct8x8_1_add(input, dest, stride); |
1351 else if (eob <= 12) | 1393 else if (eob <= 12) |
1352 vp9_idct8x8_12_add(input, dest, stride); | 1394 vp9_idct8x8_12_add(input, dest, stride); |
1353 else | 1395 else |
1354 vp9_idct8x8_64_add(input, dest, stride); | 1396 vp9_idct8x8_64_add(input, dest, stride); |
1355 } | 1397 } |
1356 | 1398 |
1357 void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, | 1399 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, |
1358 int eob) { | 1400 int eob) { |
1359 /* The calculation can be simplified if there are not many non-zero dct | 1401 /* The calculation can be simplified if there are not many non-zero dct |
1360 * coefficients. Use eobs to separate different cases. */ | 1402 * coefficients. Use eobs to separate different cases. */ |
1361 if (eob == 1) | 1403 if (eob == 1) |
1362 /* DC only DCT coefficient. */ | 1404 /* DC only DCT coefficient. */ |
1363 vp9_idct16x16_1_add(input, dest, stride); | 1405 vp9_idct16x16_1_add(input, dest, stride); |
1364 else if (eob <= 10) | 1406 else if (eob <= 10) |
1365 vp9_idct16x16_10_add(input, dest, stride); | 1407 vp9_idct16x16_10_add(input, dest, stride); |
1366 else | 1408 else |
1367 vp9_idct16x16_256_add(input, dest, stride); | 1409 vp9_idct16x16_256_add(input, dest, stride); |
1368 } | 1410 } |
1369 | 1411 |
1370 void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, | 1412 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, |
1371 int eob) { | 1413 int eob) { |
1372 if (eob == 1) | 1414 if (eob == 1) |
1373 vp9_idct32x32_1_add(input, dest, stride); | 1415 vp9_idct32x32_1_add(input, dest, stride); |
1374 else if (eob <= 34) | 1416 else if (eob <= 34) |
1375 // non-zero coeff only in upper-left 8x8 | 1417 // non-zero coeff only in upper-left 8x8 |
1376 vp9_idct32x32_34_add(input, dest, stride); | 1418 vp9_idct32x32_34_add(input, dest, stride); |
1377 else | 1419 else |
1378 vp9_idct32x32_1024_add(input, dest, stride); | 1420 vp9_idct32x32_1024_add(input, dest, stride); |
1379 } | 1421 } |
1380 | 1422 |
1381 // iht | 1423 // iht |
1382 void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, | 1424 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, |
1383 int stride, int eob) { | 1425 int stride, int eob) { |
1384 if (tx_type == DCT_DCT) | 1426 if (tx_type == DCT_DCT) |
1385 vp9_idct4x4_add(input, dest, stride, eob); | 1427 vp9_idct4x4_add(input, dest, stride, eob); |
1386 else | 1428 else |
1387 vp9_iht4x4_16_add(input, dest, stride, tx_type); | 1429 vp9_iht4x4_16_add(input, dest, stride, tx_type); |
1388 } | 1430 } |
1389 | 1431 |
1390 void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, | 1432 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, |
1391 int stride, int eob) { | 1433 int stride, int eob) { |
1392 if (tx_type == DCT_DCT) { | 1434 if (tx_type == DCT_DCT) { |
1393 vp9_idct8x8_add(input, dest, stride, eob); | 1435 vp9_idct8x8_add(input, dest, stride, eob); |
1394 } else { | 1436 } else { |
1395 vp9_iht8x8_64_add(input, dest, stride, tx_type); | 1437 vp9_iht8x8_64_add(input, dest, stride, tx_type); |
1396 } | 1438 } |
1397 } | 1439 } |
1398 | 1440 |
1399 void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, | 1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, |
1400 int stride, int eob) { | 1442 int stride, int eob) { |
1401 if (tx_type == DCT_DCT) { | 1443 if (tx_type == DCT_DCT) { |
1402 vp9_idct16x16_add(input, dest, stride, eob); | 1444 vp9_idct16x16_add(input, dest, stride, eob); |
1403 } else { | 1445 } else { |
1404 vp9_iht16x16_256_add(input, dest, stride, tx_type); | 1446 vp9_iht16x16_256_add(input, dest, stride, tx_type); |
1405 } | 1447 } |
1406 } | 1448 } |
| 1449 |
| 1450 #if CONFIG_VP9_HIGHBITDEPTH |
// High bit-depth inverse 4x4 Walsh-Hadamard transform with reconstruction
// add (lossless path). dest8 is a CONVERT_TO_BYTEPTR-wrapped uint16_t
// buffer; bd is the bit depth used for the final pixel clamp.
void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                               int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: undo the UNIT_QUANT scaling, then apply the lifting-based
  // inverse WHT butterfly. WRAPLOW keeps intermediates in the hardware
  // bit-width when overflow emulation is enabled.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared lifting term
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same butterfly down each column, then add the result to
  // the destination pixels with a bd-aware clamp.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}
| 1504 |
// High bit-depth 1-D 4-point inverse DCT butterfly. Reads 4 coefficients,
// writes 4 outputs; bd is unused here because intermediates are bounded
// by WRAPLOW independent of bit depth.
static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;
  // stage 1: even part (DC/Nyquist) and odd part rotations.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
| 1525 |
// High bit-depth DC-only inverse 4x4 Walsh-Hadamard with reconstruction
// add: expands the single coefficient into a 4-entry column seed, then
// applies the 1-D inverse down each output column.
// NOTE(review): bd is cast to void but still used by clip_pixel_bd_high
// below; the (void) is redundant rather than meaningful.
void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                              int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;

  // Row pass collapses to splitting the (unit-quant-scaled) DC value.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  // Column pass: split each seed value and add to the four pixels of the
  // column with a bd-aware clamp.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
| 1554 |
| 1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1556 int stride, int bd) { |
| 1557 tran_low_t out[4 * 4]; |
| 1558 tran_low_t *outptr = out; |
| 1559 int i, j; |
| 1560 tran_low_t temp_in[4], temp_out[4]; |
| 1561 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1562 |
| 1563 // Rows |
| 1564 for (i = 0; i < 4; ++i) { |
| 1565 high_idct4(input, outptr, bd); |
| 1566 input += 4; |
| 1567 outptr += 4; |
| 1568 } |
| 1569 |
| 1570 // Columns |
| 1571 for (i = 0; i < 4; ++i) { |
| 1572 for (j = 0; j < 4; ++j) |
| 1573 temp_in[j] = out[j * 4 + i]; |
| 1574 high_idct4(temp_in, temp_out, bd); |
| 1575 for (j = 0; j < 4; ++j) |
| 1576 dest[j * stride + i] = clip_pixel_bd_high( |
| 1577 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
| 1578 } |
| 1579 } |
| 1580 |
| 1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1582 int dest_stride, int bd) { |
| 1583 int i; |
| 1584 tran_high_t a1; |
| 1585 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 1586 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1587 |
| 1588 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 1589 a1 = ROUND_POWER_OF_TWO(out, 4); |
| 1590 |
| 1591 for (i = 0; i < 4; i++) { |
| 1592 dest[0] = clip_pixel_bd_high(dest[0], a1, bd); |
| 1593 dest[1] = clip_pixel_bd_high(dest[1], a1, bd); |
| 1594 dest[2] = clip_pixel_bd_high(dest[2], a1, bd); |
| 1595 dest[3] = clip_pixel_bd_high(dest[3], a1, bd); |
| 1596 dest += dest_stride; |
| 1597 } |
| 1598 } |
| 1599 |
// High bit-depth 1-D 8-point inverse DCT butterfly. The even half reuses
// high_idct4; the odd half is computed with two rotation stages. WRAPLOW
// bounds every intermediate to the emulated hardware width.
static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: reorder even inputs and rotate the odd inputs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2 & stage 3 - even half: delegate to the 4-point transform
  // in place on step1[0..3].
  high_idct4(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3 - odd half: rotate the middle pair by 45 degrees.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
| 1644 |
| 1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1646 int stride, int bd) { |
| 1647 tran_low_t out[8 * 8]; |
| 1648 tran_low_t *outptr = out; |
| 1649 int i, j; |
| 1650 tran_low_t temp_in[8], temp_out[8]; |
| 1651 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1652 |
| 1653 // First transform rows. |
| 1654 for (i = 0; i < 8; ++i) { |
| 1655 high_idct8(input, outptr, bd); |
| 1656 input += 8; |
| 1657 outptr += 8; |
| 1658 } |
| 1659 |
| 1660 // Then transform columns. |
| 1661 for (i = 0; i < 8; ++i) { |
| 1662 for (j = 0; j < 8; ++j) |
| 1663 temp_in[j] = out[j * 8 + i]; |
| 1664 high_idct8(temp_in, temp_out, bd); |
| 1665 for (j = 0; j < 8; ++j) |
| 1666 dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], |
| 1667 ROUND_POWER_OF_TWO(temp_out[j], 5), |
| 1668 bd); |
| 1669 } |
| 1670 } |
| 1671 |
| 1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1673 int stride, int bd) { |
| 1674 int i, j; |
| 1675 tran_high_t a1; |
| 1676 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 1677 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1678 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 1679 a1 = ROUND_POWER_OF_TWO(out, 5); |
| 1680 for (j = 0; j < 8; ++j) { |
| 1681 for (i = 0; i < 8; ++i) |
| 1682 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); |
| 1683 dest += stride; |
| 1684 } |
| 1685 } |
| 1686 |
// High bit-depth 1-D 4-point inverse ADST (asymmetric DST). bd is unused:
// the dynamic-range comment below shows intermediates fit regardless.
static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
  (void) bd;

  // All-zero input short-circuits to an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    vpx_memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Sine-basis products of the 4-point ADST.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0));
  output[1] = WRAPLOW(dct_const_round_shift(s1));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s3));
}
| 1729 |
| 1730 void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1731 int stride, int tx_type, int bd) { |
| 1732 const high_transform_2d IHT_4[] = { |
| 1733 { high_idct4, high_idct4 }, // DCT_DCT = 0 |
| 1734 { high_iadst4, high_idct4 }, // ADST_DCT = 1 |
| 1735 { high_idct4, high_iadst4 }, // DCT_ADST = 2 |
| 1736 { high_iadst4, high_iadst4 } // ADST_ADST = 3 |
| 1737 }; |
| 1738 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1739 |
| 1740 int i, j; |
| 1741 tran_low_t out[4 * 4]; |
| 1742 tran_low_t *outptr = out; |
| 1743 tran_low_t temp_in[4], temp_out[4]; |
| 1744 |
| 1745 // Inverse transform row vectors. |
| 1746 for (i = 0; i < 4; ++i) { |
| 1747 IHT_4[tx_type].rows(input, outptr, bd); |
| 1748 input += 4; |
| 1749 outptr += 4; |
| 1750 } |
| 1751 |
| 1752 // Inverse transform column vectors. |
| 1753 for (i = 0; i < 4; ++i) { |
| 1754 for (j = 0; j < 4; ++j) |
| 1755 temp_in[j] = out[j * 4 + i]; |
| 1756 IHT_4[tx_type].cols(temp_in, temp_out, bd); |
| 1757 for (j = 0; j < 4; ++j) |
| 1758 dest[j * stride + i] = clip_pixel_bd_high( |
| 1759 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
| 1760 } |
| 1761 } |
| 1762 |
// High bit-depth 1-D 8-point inverse ADST. Inputs are consumed in the
// ADST's permuted order; outputs carry the alternating sign pattern of
// the final stage. bd is unused (WRAPLOW bounds all intermediates).
static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Permuted input order required by the ADST factorization.
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
  (void) bd;

  // All-zero input short-circuits to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    vpx_memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: pairwise rotations by odd cosine angles.
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: rotate the second half; first half passes through.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: 45-degree rotations on the remaining pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output permutation with alternating negation.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
| 1839 |
// Row/column 1-D transform pairs for each 8x8 hybrid transform type,
// indexed by TX_TYPE.
static const high_transform_2d HIGH_IHT_8[] = {
  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
  { high_iadst8, high_idct8  },  // ADST_DCT = 1
  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
};
| 1846 |
| 1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1848 int stride, int tx_type, int bd) { |
| 1849 int i, j; |
| 1850 tran_low_t out[8 * 8]; |
| 1851 tran_low_t *outptr = out; |
| 1852 tran_low_t temp_in[8], temp_out[8]; |
| 1853 const high_transform_2d ht = HIGH_IHT_8[tx_type]; |
| 1854 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1855 |
| 1856 // Inverse transform row vectors. |
| 1857 for (i = 0; i < 8; ++i) { |
| 1858 ht.rows(input, outptr, bd); |
| 1859 input += 8; |
| 1860 outptr += 8; |
| 1861 } |
| 1862 |
| 1863 // Inverse transform column vectors. |
| 1864 for (i = 0; i < 8; ++i) { |
| 1865 for (j = 0; j < 8; ++j) |
| 1866 temp_in[j] = out[j * 8 + i]; |
| 1867 ht.cols(temp_in, temp_out, bd); |
| 1868 for (j = 0; j < 8; ++j) |
| 1869 dest[j * stride + i] = clip_pixel_bd_high( |
| 1870 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
| 1871 } |
| 1872 } |
| 1873 |
| 1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1875 int stride, int bd) { |
| 1876 tran_low_t out[8 * 8] = { 0 }; |
| 1877 tran_low_t *outptr = out; |
| 1878 int i, j; |
| 1879 tran_low_t temp_in[8], temp_out[8]; |
| 1880 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1881 |
| 1882 // First transform rows. |
| 1883 // Only first 4 row has non-zero coefs. |
| 1884 for (i = 0; i < 4; ++i) { |
| 1885 high_idct8(input, outptr, bd); |
| 1886 input += 8; |
| 1887 outptr += 8; |
| 1888 } |
| 1889 // Then transform columns. |
| 1890 for (i = 0; i < 8; ++i) { |
| 1891 for (j = 0; j < 8; ++j) |
| 1892 temp_in[j] = out[j * 8 + i]; |
| 1893 high_idct8(temp_in, temp_out, bd); |
| 1894 for (j = 0; j < 8; ++j) |
| 1895 dest[j * stride + i] = clip_pixel_bd_high( |
| 1896 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
| 1897 } |
| 1898 } |
| 1899 |
| 1900 static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { |
| 1901 tran_low_t step1[16], step2[16]; |
| 1902 tran_high_t temp1, temp2; |
| 1903 (void) bd; |
| 1904 |
| 1905 // stage 1 |
| 1906 step1[0] = input[0/2]; |
| 1907 step1[1] = input[16/2]; |
| 1908 step1[2] = input[8/2]; |
| 1909 step1[3] = input[24/2]; |
| 1910 step1[4] = input[4/2]; |
| 1911 step1[5] = input[20/2]; |
| 1912 step1[6] = input[12/2]; |
| 1913 step1[7] = input[28/2]; |
| 1914 step1[8] = input[2/2]; |
| 1915 step1[9] = input[18/2]; |
| 1916 step1[10] = input[10/2]; |
| 1917 step1[11] = input[26/2]; |
| 1918 step1[12] = input[6/2]; |
| 1919 step1[13] = input[22/2]; |
| 1920 step1[14] = input[14/2]; |
| 1921 step1[15] = input[30/2]; |
| 1922 |
| 1923 // stage 2 |
| 1924 step2[0] = step1[0]; |
| 1925 step2[1] = step1[1]; |
| 1926 step2[2] = step1[2]; |
| 1927 step2[3] = step1[3]; |
| 1928 step2[4] = step1[4]; |
| 1929 step2[5] = step1[5]; |
| 1930 step2[6] = step1[6]; |
| 1931 step2[7] = step1[7]; |
| 1932 |
| 1933 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; |
| 1934 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; |
| 1935 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1936 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1937 |
| 1938 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; |
| 1939 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; |
| 1940 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1941 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1942 |
| 1943 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; |
| 1944 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; |
| 1945 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1946 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1947 |
| 1948 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; |
| 1949 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; |
| 1950 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1951 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1952 |
| 1953 // stage 3 |
| 1954 step1[0] = step2[0]; |
| 1955 step1[1] = step2[1]; |
| 1956 step1[2] = step2[2]; |
| 1957 step1[3] = step2[3]; |
| 1958 |
| 1959 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; |
| 1960 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; |
| 1961 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1962 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1963 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; |
| 1964 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; |
| 1965 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1966 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1967 |
| 1968 step1[8] = WRAPLOW(step2[8] + step2[9]); |
| 1969 step1[9] = WRAPLOW(step2[8] - step2[9]); |
| 1970 step1[10] = WRAPLOW(-step2[10] + step2[11]); |
| 1971 step1[11] = WRAPLOW(step2[10] + step2[11]); |
| 1972 step1[12] = WRAPLOW(step2[12] + step2[13]); |
| 1973 step1[13] = WRAPLOW(step2[12] - step2[13]); |
| 1974 step1[14] = WRAPLOW(-step2[14] + step2[15]); |
| 1975 step1[15] = WRAPLOW(step2[14] + step2[15]); |
| 1976 |
| 1977 // stage 4 |
| 1978 temp1 = (step1[0] + step1[1]) * cospi_16_64; |
| 1979 temp2 = (step1[0] - step1[1]) * cospi_16_64; |
| 1980 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1981 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1982 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; |
| 1983 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; |
| 1984 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1985 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1986 step2[4] = WRAPLOW(step1[4] + step1[5]); |
| 1987 step2[5] = WRAPLOW(step1[4] - step1[5]); |
| 1988 step2[6] = WRAPLOW(-step1[6] + step1[7]); |
| 1989 step2[7] = WRAPLOW(step1[6] + step1[7]); |
| 1990 |
| 1991 step2[8] = step1[8]; |
| 1992 step2[15] = step1[15]; |
| 1993 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; |
| 1994 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; |
| 1995 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1996 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1997 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; |
| 1998 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; |
| 1999 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2000 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2001 step2[11] = step1[11]; |
| 2002 step2[12] = step1[12]; |
| 2003 |
| 2004 // stage 5 |
| 2005 step1[0] = WRAPLOW(step2[0] + step2[3]); |
| 2006 step1[1] = WRAPLOW(step2[1] + step2[2]); |
| 2007 step1[2] = WRAPLOW(step2[1] - step2[2]); |
| 2008 step1[3] = WRAPLOW(step2[0] - step2[3]); |
| 2009 step1[4] = step2[4]; |
| 2010 temp1 = (step2[6] - step2[5]) * cospi_16_64; |
| 2011 temp2 = (step2[5] + step2[6]) * cospi_16_64; |
| 2012 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2013 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2014 step1[7] = step2[7]; |
| 2015 |
| 2016 step1[8] = WRAPLOW(step2[8] + step2[11]); |
| 2017 step1[9] = WRAPLOW(step2[9] + step2[10]); |
| 2018 step1[10] = WRAPLOW(step2[9] - step2[10]); |
| 2019 step1[11] = WRAPLOW(step2[8] - step2[11]); |
| 2020 step1[12] = WRAPLOW(-step2[12] + step2[15]); |
| 2021 step1[13] = WRAPLOW(-step2[13] + step2[14]); |
| 2022 step1[14] = WRAPLOW(step2[13] + step2[14]); |
| 2023 step1[15] = WRAPLOW(step2[12] + step2[15]); |
| 2024 |
| 2025 // stage 6 |
| 2026 step2[0] = WRAPLOW(step1[0] + step1[7]); |
| 2027 step2[1] = WRAPLOW(step1[1] + step1[6]); |
| 2028 step2[2] = WRAPLOW(step1[2] + step1[5]); |
| 2029 step2[3] = WRAPLOW(step1[3] + step1[4]); |
| 2030 step2[4] = WRAPLOW(step1[3] - step1[4]); |
| 2031 step2[5] = WRAPLOW(step1[2] - step1[5]); |
| 2032 step2[6] = WRAPLOW(step1[1] - step1[6]); |
| 2033 step2[7] = WRAPLOW(step1[0] - step1[7]); |
| 2034 step2[8] = step1[8]; |
| 2035 step2[9] = step1[9]; |
| 2036 temp1 = (-step1[10] + step1[13]) * cospi_16_64; |
| 2037 temp2 = (step1[10] + step1[13]) * cospi_16_64; |
| 2038 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2039 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2040 temp1 = (-step1[11] + step1[12]) * cospi_16_64; |
| 2041 temp2 = (step1[11] + step1[12]) * cospi_16_64; |
| 2042 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2043 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2044 step2[14] = step1[14]; |
| 2045 step2[15] = step1[15]; |
| 2046 |
| 2047 // stage 7 |
| 2048 output[0] = WRAPLOW(step2[0] + step2[15]); |
| 2049 output[1] = WRAPLOW(step2[1] + step2[14]); |
| 2050 output[2] = WRAPLOW(step2[2] + step2[13]); |
| 2051 output[3] = WRAPLOW(step2[3] + step2[12]); |
| 2052 output[4] = WRAPLOW(step2[4] + step2[11]); |
| 2053 output[5] = WRAPLOW(step2[5] + step2[10]); |
| 2054 output[6] = WRAPLOW(step2[6] + step2[9]); |
| 2055 output[7] = WRAPLOW(step2[7] + step2[8]); |
| 2056 output[8] = WRAPLOW(step2[7] - step2[8]); |
| 2057 output[9] = WRAPLOW(step2[6] - step2[9]); |
| 2058 output[10] = WRAPLOW(step2[5] - step2[10]); |
| 2059 output[11] = WRAPLOW(step2[4] - step2[11]); |
| 2060 output[12] = WRAPLOW(step2[3] - step2[12]); |
| 2061 output[13] = WRAPLOW(step2[2] - step2[13]); |
| 2062 output[14] = WRAPLOW(step2[1] - step2[14]); |
| 2063 output[15] = WRAPLOW(step2[0] - step2[15]); |
| 2064 } |
| 2065 |
| 2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2067 int stride, int bd) { |
| 2068 tran_low_t out[16 * 16]; |
| 2069 tran_low_t *outptr = out; |
| 2070 int i, j; |
| 2071 tran_low_t temp_in[16], temp_out[16]; |
| 2072 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2073 |
| 2074 // First transform rows. |
| 2075 for (i = 0; i < 16; ++i) { |
| 2076 high_idct16(input, outptr, bd); |
| 2077 input += 16; |
| 2078 outptr += 16; |
| 2079 } |
| 2080 |
| 2081 // Then transform columns. |
| 2082 for (i = 0; i < 16; ++i) { |
| 2083 for (j = 0; j < 16; ++j) |
| 2084 temp_in[j] = out[j * 16 + i]; |
| 2085 high_idct16(temp_in, temp_out, bd); |
| 2086 for (j = 0; j < 16; ++j) |
| 2087 dest[j * stride + i] = clip_pixel_bd_high( |
| 2088 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2089 } |
| 2090 } |
| 2091 |
// 16-point inverse ADST, high-bitdepth path. Input is read in the permuted
// order used by the VP9 hybrid transform; output is written with the sign
// flips and reordering applied in the final assignment block. The bd
// argument is unused here; overflow handling is done by WRAPLOW() (which
// emulates hardware wrapping when CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
// is enabled — see the comment at the top of this file).
static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Load inputs in the ADST's butterfly ordering.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
  (void) bd;

  // Early out: an all-zero input produces an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    vpx_memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Final output permutation and sign flips of the ADST.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
| 2261 |
// Row/column 1-D transform pairs for the 16x16 hybrid inverse transform,
// indexed by tx_type. The .rows transform is applied first, then .cols
// (see vp9_high_iht16x16_256_add_c below).
static const high_transform_2d HIGH_IHT_16[] = {
  { high_idct16, high_idct16  },  // DCT_DCT  = 0
  { high_iadst16, high_idct16 },  // ADST_DCT = 1
  { high_idct16, high_iadst16 },  // DCT_ADST = 2
  { high_iadst16, high_iadst16 }  // ADST_ADST = 3
};
| 2268 |
| 2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2270 int stride, int tx_type, int bd) { |
| 2271 int i, j; |
| 2272 tran_low_t out[16 * 16]; |
| 2273 tran_low_t *outptr = out; |
| 2274 tran_low_t temp_in[16], temp_out[16]; |
| 2275 const high_transform_2d ht = HIGH_IHT_16[tx_type]; |
| 2276 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2277 |
| 2278 // Rows |
| 2279 for (i = 0; i < 16; ++i) { |
| 2280 ht.rows(input, outptr, bd); |
| 2281 input += 16; |
| 2282 outptr += 16; |
| 2283 } |
| 2284 |
| 2285 // Columns |
| 2286 for (i = 0; i < 16; ++i) { |
| 2287 for (j = 0; j < 16; ++j) |
| 2288 temp_in[j] = out[j * 16 + i]; |
| 2289 ht.cols(temp_in, temp_out, bd); |
| 2290 for (j = 0; j < 16; ++j) |
| 2291 dest[j * stride + i] = clip_pixel_bd_high( |
| 2292 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2293 } |
| 2294 } |
| 2295 |
| 2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2297 int stride, int bd) { |
| 2298 tran_low_t out[16 * 16] = { 0 }; |
| 2299 tran_low_t *outptr = out; |
| 2300 int i, j; |
| 2301 tran_low_t temp_in[16], temp_out[16]; |
| 2302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2303 |
| 2304 // First transform rows. Since all non-zero dct coefficients are in |
| 2305 // upper-left 4x4 area, we only need to calculate first 4 rows here. |
| 2306 for (i = 0; i < 4; ++i) { |
| 2307 high_idct16(input, outptr, bd); |
| 2308 input += 16; |
| 2309 outptr += 16; |
| 2310 } |
| 2311 |
| 2312 // Then transform columns. |
| 2313 for (i = 0; i < 16; ++i) { |
| 2314 for (j = 0; j < 16; ++j) |
| 2315 temp_in[j] = out[j*16 + i]; |
| 2316 high_idct16(temp_in, temp_out, bd); |
| 2317 for (j = 0; j < 16; ++j) |
| 2318 dest[j * stride + i] = clip_pixel_bd_high( |
| 2319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2320 } |
| 2321 } |
| 2322 |
| 2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2324 int stride, int bd) { |
| 2325 int i, j; |
| 2326 tran_high_t a1; |
| 2327 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 2328 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2329 |
| 2330 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 2331 a1 = ROUND_POWER_OF_TWO(out, 6); |
| 2332 for (j = 0; j < 16; ++j) { |
| 2333 for (i = 0; i < 16; ++i) |
| 2334 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); |
| 2335 dest += stride; |
| 2336 } |
| 2337 } |
| 2338 |
// 32-point 1-D inverse DCT, high-bitdepth path. Implemented as the usual
// 8-stage butterfly network over two ping-pong buffers (step1/step2); each
// multiply-accumulate pair is rounded with dct_const_round_shift() and
// wrapped with WRAPLOW() (hardware-style overflow emulation when
// CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is set — see top of file). The bd
// argument is unused in this C reference implementation.
static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1 — load inputs in bit-reversed/interleaved butterfly order.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  // WRAPLOW here is an identity on already-wrapped values; kept for
  // uniformity with the surrounding assignments.
  step2[14] = WRAPLOW(step1[14]);
  step2[15] = WRAPLOW(step1[15]);

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage — butterfly of the even half against the reversed odd half.
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
| 2706 |
| 2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2708 int stride, int bd) { |
| 2709 tran_low_t out[32 * 32]; |
| 2710 tran_low_t *outptr = out; |
| 2711 int i, j; |
| 2712 tran_low_t temp_in[32], temp_out[32]; |
| 2713 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2714 |
| 2715 // Rows |
| 2716 for (i = 0; i < 32; ++i) { |
| 2717 tran_low_t zero_coeff[16]; |
| 2718 for (j = 0; j < 16; ++j) |
| 2719 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; |
| 2720 for (j = 0; j < 8; ++j) |
| 2721 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
| 2722 for (j = 0; j < 4; ++j) |
| 2723 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
| 2724 for (j = 0; j < 2; ++j) |
| 2725 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
| 2726 |
| 2727 if (zero_coeff[0] | zero_coeff[1]) |
| 2728 high_idct32(input, outptr, bd); |
| 2729 else |
| 2730 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); |
| 2731 input += 32; |
| 2732 outptr += 32; |
| 2733 } |
| 2734 |
| 2735 // Columns |
| 2736 for (i = 0; i < 32; ++i) { |
| 2737 for (j = 0; j < 32; ++j) |
| 2738 temp_in[j] = out[j * 32 + i]; |
| 2739 high_idct32(temp_in, temp_out, bd); |
| 2740 for (j = 0; j < 32; ++j) |
| 2741 dest[j * stride + i] = clip_pixel_bd_high( |
| 2742 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2743 } |
| 2744 } |
| 2745 |
| 2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2747 int stride, int bd) { |
| 2748 tran_low_t out[32 * 32] = {0}; |
| 2749 tran_low_t *outptr = out; |
| 2750 int i, j; |
| 2751 tran_low_t temp_in[32], temp_out[32]; |
| 2752 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2753 |
| 2754 // Rows |
| 2755 // Only upper-left 8x8 has non-zero coeff. |
| 2756 for (i = 0; i < 8; ++i) { |
| 2757 high_idct32(input, outptr, bd); |
| 2758 input += 32; |
| 2759 outptr += 32; |
| 2760 } |
| 2761 // Columns |
| 2762 for (i = 0; i < 32; ++i) { |
| 2763 for (j = 0; j < 32; ++j) |
| 2764 temp_in[j] = out[j * 32 + i]; |
| 2765 high_idct32(temp_in, temp_out, bd); |
| 2766 for (j = 0; j < 32; ++j) |
| 2767 dest[j * stride + i] = clip_pixel_bd_high( |
| 2768 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2769 } |
| 2770 } |
| 2771 |
| 2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2773 int stride, int bd) { |
| 2774 int i, j; |
| 2775 int a1; |
| 2776 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2777 |
| 2778 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 2779 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 2780 a1 = ROUND_POWER_OF_TWO(out, 6); |
| 2781 |
| 2782 for (j = 0; j < 32; ++j) { |
| 2783 for (i = 0; i < 32; ++i) |
| 2784 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); |
| 2785 dest += stride; |
| 2786 } |
| 2787 } |
| 2788 |
| 2789 // idct |
| 2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2791 int eob, int bd) { |
| 2792 if (eob > 1) |
| 2793 vp9_high_idct4x4_16_add(input, dest, stride, bd); |
| 2794 else |
| 2795 vp9_high_idct4x4_1_add(input, dest, stride, bd); |
| 2796 } |
| 2797 |
| 2798 |
| 2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2800 int eob, int bd) { |
| 2801 if (eob > 1) |
| 2802 vp9_high_iwht4x4_16_add(input, dest, stride, bd); |
| 2803 else |
| 2804 vp9_high_iwht4x4_1_add(input, dest, stride, bd); |
| 2805 } |
| 2806 |
| 2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2808 int eob, int bd) { |
| 2809 // If dc is 1, then input[0] is the reconstructed value, do not need |
| 2810 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. |
| 2811 |
| 2812 // The calculation can be simplified if there are not many non-zero dct |
| 2813 // coefficients. Use eobs to decide what to do. |
| 2814 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. |
| 2815 // Combine that with code here. |
| 2816 // DC only DCT coefficient |
| 2817 if (eob == 1) { |
| 2818 vp9_high_idct8x8_1_add(input, dest, stride, bd); |
| 2819 } else if (eob <= 10) { |
| 2820 vp9_high_idct8x8_10_add(input, dest, stride, bd); |
| 2821 } else { |
| 2822 vp9_high_idct8x8_64_add(input, dest, stride, bd); |
| 2823 } |
| 2824 } |
| 2825 |
| 2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2827 int eob, int bd) { |
| 2828 // The calculation can be simplified if there are not many non-zero dct |
| 2829 // coefficients. Use eobs to separate different cases. |
| 2830 // DC only DCT coefficient. |
| 2831 if (eob == 1) { |
| 2832 vp9_high_idct16x16_1_add(input, dest, stride, bd); |
| 2833 } else if (eob <= 10) { |
| 2834 vp9_high_idct16x16_10_add(input, dest, stride, bd); |
| 2835 } else { |
| 2836 vp9_high_idct16x16_256_add(input, dest, stride, bd); |
| 2837 } |
| 2838 } |
| 2839 |
| 2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2841 int eob, int bd) { |
| 2842 // Non-zero coeff only in upper-left 8x8 |
| 2843 if (eob == 1) { |
| 2844 vp9_high_idct32x32_1_add(input, dest, stride, bd); |
| 2845 } else if (eob <= 34) { |
| 2846 vp9_high_idct32x32_34_add(input, dest, stride, bd); |
| 2847 } else { |
| 2848 vp9_high_idct32x32_1024_add(input, dest, stride, bd); |
| 2849 } |
| 2850 } |
| 2851 |
| 2852 // iht |
| 2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, |
| 2854 uint8_t *dest, int stride, int eob, int bd) { |
| 2855 if (tx_type == DCT_DCT) |
| 2856 vp9_high_idct4x4_add(input, dest, stride, eob, bd); |
| 2857 else |
| 2858 vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd); |
| 2859 } |
| 2860 |
| 2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, |
| 2862 uint8_t *dest, int stride, int eob, int bd) { |
| 2863 if (tx_type == DCT_DCT) { |
| 2864 vp9_high_idct8x8_add(input, dest, stride, eob, bd); |
| 2865 } else { |
| 2866 vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd); |
| 2867 } |
| 2868 } |
| 2869 |
| 2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, |
| 2871 uint8_t *dest, int stride, int eob, int bd) { |
| 2872 if (tx_type == DCT_DCT) { |
| 2873 vp9_high_idct16x16_add(input, dest, stride, eob, bd); |
| 2874 } else { |
| 2875 vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd); |
| 2876 } |
| 2877 } |
| 2878 #endif // CONFIG_VP9_HIGHBITDEPTH |
OLD | NEW |