OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 | 11 |
12 /**************************************************************************** | 12 /**************************************************************************** |
13 * Notes: | 13 * Notes: |
14 * | 14 * |
15 * This implementation makes use of 16 bit fixed point version of two multiply | 15 * This implementation makes use of 16 bit fixed point version of two multiply |
16 * constants: | 16 * constants: |
17 * 1. sqrt(2) * cos (pi/8) | 17 * 1. sqrt(2) * cos (pi/8) |
18 * 2. sqrt(2) * sin (pi/8) | 18 * 2. sqrt(2) * sin (pi/8) |
19 * Because the first constant is bigger than 1, to maintain the same 16 bit | 19 * Because the first constant is bigger than 1, to maintain the same 16 bit |
20 * fixed point precision as the second one, we use a trick of | 20 * fixed point precision as the second one, we use a trick of |
21 * x * a = x + x*(a-1) | 21 * x * a = x + x*(a-1) |
22 * so | 22 * so |
23 * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). | 23 * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). |
24 **************************************************************************/ | 24 **************************************************************************/ |
25 #include <assert.h> | 25 #include <assert.h> |
26 #include <math.h> | 26 #include <math.h> |
27 #include "vpx_ports/config.h" | 27 #include "./vpx_config.h" |
28 #include "vp9/common/vp9_systemdependent.h" | 28 #include "vp9/common/vp9_systemdependent.h" |
29 | |
30 #include "vp9/common/vp9_blockd.h" | 29 #include "vp9/common/vp9_blockd.h" |
| 30 #include "vp9/common/vp9_common.h" |
31 | 31 |
32 static const int cospi8sqrt2minus1 = 20091; | 32 static const int cospi8sqrt2minus1 = 20091; |
33 static const int sinpi8sqrt2 = 35468; | 33 static const int sinpi8sqrt2 = 35468; |
34 static const int rounding = 0; | 34 static const int rounding = 0; |
35 | 35 |
36 static const int16_t idct_i4[16] = { | 36 static const int16_t idct_i4[16] = { |
37 8192, 10703, 8192, 4433, | 37 8192, 10703, 8192, 4433, |
38 8192, 4433, -8192, -10703, | 38 8192, 4433, -8192, -10703, |
39 8192, -4433, -8192, 10703, | 39 8192, -4433, -8192, 10703, |
40 8192, -10703, 8192, -4433 | 40 8192, -10703, 8192, -4433 |
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
152 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189, | 152 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189, |
153 -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614, | 153 -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614, |
154 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084, | 154 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084, |
155 -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607, | 155 -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607, |
156 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, | 156 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, |
157 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 | 157 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 |
158 }; | 158 }; |
159 | 159 |
160 | 160 |
161 /* Converted the transforms to integer form. */ | 161 /* Converted the transforms to integer form. */ |
162 #define VERTICAL_SHIFT 14 // 16 | 162 #define HORIZONTAL_SHIFT 14 // 16 |
| 163 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) |
| 164 #define VERTICAL_SHIFT 17 // 15 |
163 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) | 165 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) |
164 #define HORIZONTAL_SHIFT 17 // 15 | |
165 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) | |
166 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, | 166 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, |
167 TX_TYPE tx_type, int tx_dim, uint16_t eobs) { | 167 TX_TYPE tx_type, int tx_dim, uint16_t eobs) { |
168 int i, j, k; | 168 int i, j, k; |
169 int nz_dim; | 169 int nz_dim; |
170 int16_t imbuf[256]; | 170 int16_t imbuf[256]; |
171 | 171 |
172 const int16_t *ip = input; | 172 const int16_t *ip = input; |
173 int16_t *op = output; | 173 int16_t *op = output; |
174 int16_t *im = &imbuf[0]; | 174 int16_t *im = &imbuf[0]; |
175 | 175 |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
211 vpx_memset(im, 0, 512); | 211 vpx_memset(im, 0, 512); |
212 nz_dim = 8; | 212 nz_dim = 8; |
213 if(eobs < 3) { | 213 if(eobs < 3) { |
214 nz_dim = 2; | 214 nz_dim = 2; |
215 } else if(eobs < 10) { | 215 } else if(eobs < 10) { |
216 nz_dim = 4; | 216 nz_dim = 4; |
217 } | 217 } |
218 } | 218 } |
219 } | 219 } |
220 | 220 |
221 /* vertical transformation */ | 221 /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps |
| 222 * from right to left: |
| 223 * 1. horizontal transform: Y= Z*Transposed_M2 |
| 224 * 2. vertical transform: X = M1*Y |
| 225 * In SIMD, doing this way could eliminate the transpose needed if it is |
| 226 * calculated from left to right. |
| 227 */ |
| 228 /* Horizontal transformation */ |
222 for (j = 0; j < tx_dim; j++) { | 229 for (j = 0; j < tx_dim; j++) { |
223 for (i = 0; i < nz_dim; i++) { | 230 for (i = 0; i < nz_dim; i++) { |
224 int temp = 0; | 231 int temp = 0; |
225 | 232 |
226 for (k = 0; k < nz_dim; k++) { | 233 for (k = 0; k < nz_dim; k++) { |
227 temp += ptv[k] * ip[(k * tx_dim)]; | 234 temp += ip[k] * pth[k]; |
228 } | 235 } |
229 | 236 |
230 im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); | 237 /* Calculate im and store it in its transposed position. */ |
231 ip++; | 238 im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); |
| 239 ip += tx_dim; |
232 } | 240 } |
233 im += tx_dim; // 16 | 241 im += tx_dim; |
234 ptv += tx_dim; | 242 pth += tx_dim; |
235 ip = input; | 243 ip = input; |
236 } | 244 } |
237 | 245 |
238 /* horizontal transformation */ | 246 /* Vertical transformation */ |
239 im = &imbuf[0]; | 247 im = &imbuf[0]; |
240 | 248 |
241 for (j = 0; j < tx_dim; j++) { | 249 for (i = 0; i < tx_dim; i++) { |
242 const int16_t *pthc = pth; | 250 for (j = 0; j < tx_dim; j++) { |
243 | |
244 for (i = 0; i < tx_dim; i++) { | |
245 int temp = 0; | 251 int temp = 0; |
246 | 252 |
247 for (k = 0; k < nz_dim; k++) { | 253 for (k = 0; k < nz_dim; k++) { |
248 temp += im[k] * pthc[k]; | 254 temp += ptv[k] * im[k]; |
249 } | 255 } |
250 | 256 |
251 op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); | 257 op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); |
252 pthc += tx_dim; | 258 im += tx_dim; |
253 } | 259 } |
254 | 260 im = &imbuf[0]; |
255 im += tx_dim; // 16 | 261 ptv += tx_dim; |
256 op += shortpitch; | 262 op += shortpitch; |
257 } | 263 } |
258 } | 264 } |
259 | 265 |
260 void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) { | 266 void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) { |
261 int i; | 267 int i; |
262 int a1, b1, c1, d1; | 268 int a1, b1, c1, d1; |
263 | 269 |
264 short *ip = input; | 270 int16_t *ip = input; |
265 short *op = output; | 271 int16_t *op = output; |
266 int temp1, temp2; | 272 int temp1, temp2; |
267 int shortpitch = pitch >> 1; | 273 int shortpitch = pitch >> 1; |
268 | 274 |
269 for (i = 0; i < 4; i++) { | 275 for (i = 0; i < 4; i++) { |
270 a1 = ip[0] + ip[8]; | 276 a1 = ip[0] + ip[8]; |
271 b1 = ip[0] - ip[8]; | 277 b1 = ip[0] - ip[8]; |
272 | 278 |
273 temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; | 279 temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; |
274 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); | 280 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); |
275 c1 = temp1 - temp2; | 281 c1 = temp1 - temp2; |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
307 op[3] = (a1 - d1 + 16) >> 5; | 313 op[3] = (a1 - d1 + 16) >> 5; |
308 | 314 |
309 op[1] = (b1 + c1 + 16) >> 5; | 315 op[1] = (b1 + c1 + 16) >> 5; |
310 op[2] = (b1 - c1 + 16) >> 5; | 316 op[2] = (b1 - c1 + 16) >> 5; |
311 | 317 |
312 ip += shortpitch; | 318 ip += shortpitch; |
313 op += shortpitch; | 319 op += shortpitch; |
314 } | 320 } |
315 } | 321 } |
316 | 322 |
317 void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) { | 323 void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) { |
318 int i; | 324 int i; |
319 int a1; | 325 int a1; |
320 short *op = output; | 326 int16_t *op = output; |
321 int shortpitch = pitch >> 1; | 327 int shortpitch = pitch >> 1; |
322 a1 = ((input[0] + 16) >> 5); | 328 a1 = ((input[0] + 16) >> 5); |
323 for (i = 0; i < 4; i++) { | 329 for (i = 0; i < 4; i++) { |
324 op[0] = a1; | 330 op[0] = a1; |
325 op[1] = a1; | 331 op[1] = a1; |
326 op[2] = a1; | 332 op[2] = a1; |
327 op[3] = a1; | 333 op[3] = a1; |
328 op += shortpitch; | 334 op += shortpitch; |
329 } | 335 } |
330 } | 336 } |
331 | 337 |
332 void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, | 338 void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, |
333 unsigned char *dst_ptr, int pitch, int stride) { | 339 uint8_t *dst_ptr, int pitch, int stride) { |
334 int a1 = ((input_dc + 16) >> 5); | 340 int a1 = ((input_dc + 16) >> 5); |
335 int r, c; | 341 int r, c; |
336 | 342 |
337 for (r = 0; r < 4; r++) { | 343 for (r = 0; r < 4; r++) { |
338 for (c = 0; c < 4; c++) { | 344 for (c = 0; c < 4; c++) { |
339 int a = a1 + pred_ptr[c]; | 345 dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); |
340 | |
341 if (a < 0) | |
342 a = 0; | |
343 | |
344 if (a > 255) | |
345 a = 255; | |
346 | |
347 dst_ptr[c] = (unsigned char) a; | |
348 } | 346 } |
349 | 347 |
350 dst_ptr += stride; | 348 dst_ptr += stride; |
351 pred_ptr += pitch; | 349 pred_ptr += pitch; |
352 } | 350 } |
353 } | 351 } |
354 | 352 |
355 void vp9_short_inv_walsh4x4_c(short *input, short *output) { | 353 void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) { |
356 int i; | 354 int i; |
357 int a1, b1, c1, d1; | 355 int a1, b1, c1, d1; |
358 short *ip = input; | 356 int16_t *ip = input; |
359 short *op = output; | 357 int16_t *op = output; |
360 | 358 |
361 for (i = 0; i < 4; i++) { | 359 for (i = 0; i < 4; i++) { |
362 a1 = ((ip[0] + ip[3])); | 360 a1 = ((ip[0] + ip[3])); |
363 b1 = ((ip[1] + ip[2])); | 361 b1 = ((ip[1] + ip[2])); |
364 c1 = ((ip[1] - ip[2])); | 362 c1 = ((ip[1] - ip[2])); |
365 d1 = ((ip[0] - ip[3])); | 363 d1 = ((ip[0] - ip[3])); |
366 | 364 |
367 op[0] = (a1 + b1 + 1) >> 1; | 365 op[0] = (a1 + b1 + 1) >> 1; |
368 op[1] = (c1 + d1) >> 1; | 366 op[1] = (c1 + d1) >> 1; |
369 op[2] = (a1 - b1) >> 1; | 367 op[2] = (a1 - b1) >> 1; |
(...skipping 12 matching lines...) Expand all Loading... |
382 d1 = ip[0] - ip[12]; | 380 d1 = ip[0] - ip[12]; |
383 op[0] = (a1 + b1 + 1) >> 1; | 381 op[0] = (a1 + b1 + 1) >> 1; |
384 op[4] = (c1 + d1) >> 1; | 382 op[4] = (c1 + d1) >> 1; |
385 op[8] = (a1 - b1) >> 1; | 383 op[8] = (a1 - b1) >> 1; |
386 op[12] = (d1 - c1) >> 1; | 384 op[12] = (d1 - c1) >> 1; |
387 ip++; | 385 ip++; |
388 op++; | 386 op++; |
389 } | 387 } |
390 } | 388 } |
391 | 389 |
392 void vp9_short_inv_walsh4x4_1_c(short *in, short *out) { | 390 void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) { |
393 int i; | 391 int i; |
394 short tmp[4]; | 392 int16_t tmp[4]; |
395 short *ip = in; | 393 int16_t *ip = in; |
396 short *op = tmp; | 394 int16_t *op = tmp; |
397 | 395 |
398 op[0] = (ip[0] + 1) >> 1; | 396 op[0] = (ip[0] + 1) >> 1; |
399 op[1] = op[2] = op[3] = (ip[0] >> 1); | 397 op[1] = op[2] = op[3] = (ip[0] >> 1); |
400 | 398 |
401 ip = tmp; | 399 ip = tmp; |
402 op = out; | 400 op = out; |
403 for (i = 0; i < 4; i++) { | 401 for (i = 0; i < 4; i++) { |
404 op[0] = (ip[0] + 1) >> 1; | 402 op[0] = (ip[0] + 1) >> 1; |
405 op[4] = op[8] = op[12] = (ip[0] >> 1); | 403 op[4] = op[8] = op[12] = (ip[0] >> 1); |
406 ip++; | 404 ip++; |
407 op++; | 405 op++; |
408 } | 406 } |
409 } | 407 } |
410 | 408 |
411 #if CONFIG_LOSSLESS | 409 #if CONFIG_LOSSLESS |
412 void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) { | 410 void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) { |
413 int i; | 411 int i; |
414 int a1, b1, c1, d1; | 412 int a1, b1, c1, d1; |
415 short *ip = input; | 413 int16_t *ip = input; |
416 short *op = output; | 414 int16_t *op = output; |
417 | 415 |
418 for (i = 0; i < 4; i++) { | 416 for (i = 0; i < 4; i++) { |
419 a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR; | 417 a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR; |
420 b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR; | 418 b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR; |
421 c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR; | 419 c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR; |
422 d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR; | 420 d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR; |
423 | 421 |
424 op[0] = (a1 + b1 + 1) >> 1; | 422 op[0] = (a1 + b1 + 1) >> 1; |
425 op[1] = (c1 + d1) >> 1; | 423 op[1] = (c1 + d1) >> 1; |
426 op[2] = (a1 - b1) >> 1; | 424 op[2] = (a1 - b1) >> 1; |
(...skipping 15 matching lines...) Expand all Loading... |
442 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; | 440 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; |
443 op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; | 441 op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; |
444 op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; | 442 op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; |
445 op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; | 443 op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; |
446 | 444 |
447 ip++; | 445 ip++; |
448 op++; | 446 op++; |
449 } | 447 } |
450 } | 448 } |
451 | 449 |
452 void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) { | 450 void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) { |
453 int i; | 451 int i; |
454 short tmp[4]; | 452 int16_t tmp[4]; |
455 short *ip = in; | 453 int16_t *ip = in; |
456 short *op = tmp; | 454 int16_t *op = tmp; |
457 | 455 |
458 op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1; | 456 op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1; |
459 op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1); | 457 op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1); |
460 | 458 |
461 ip = tmp; | 459 ip = tmp; |
462 op = out; | 460 op = out; |
463 for (i = 0; i < 4; i++) { | 461 for (i = 0; i < 4; i++) { |
464 op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; | 462 op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; |
465 op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR; | 463 op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR; |
466 ip++; | 464 ip++; |
467 op++; | 465 op++; |
468 } | 466 } |
469 } | 467 } |
470 | 468 |
471 void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) { | 469 void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) { |
472 int i; | 470 int i; |
473 int a1, b1, c1, d1; | 471 int a1, b1, c1, d1; |
474 short *ip = input; | 472 int16_t *ip = input; |
475 short *op = output; | 473 int16_t *op = output; |
476 int shortpitch = pitch >> 1; | 474 int shortpitch = pitch >> 1; |
477 | 475 |
478 for (i = 0; i < 4; i++) { | 476 for (i = 0; i < 4; i++) { |
479 a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR; | 477 a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR; |
480 b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR; | 478 b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR; |
481 c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR; | 479 c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR; |
482 d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR; | 480 d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR; |
483 | 481 |
484 op[0] = (a1 + b1 + 1) >> 1; | 482 op[0] = (a1 + b1 + 1) >> 1; |
485 op[1] = (c1 + d1) >> 1; | 483 op[1] = (c1 + d1) >> 1; |
(...skipping 16 matching lines...) Expand all Loading... |
502 op[shortpitch * 0] = (a1 + b1 + 1) >> 1; | 500 op[shortpitch * 0] = (a1 + b1 + 1) >> 1; |
503 op[shortpitch * 1] = (c1 + d1) >> 1; | 501 op[shortpitch * 1] = (c1 + d1) >> 1; |
504 op[shortpitch * 2] = (a1 - b1) >> 1; | 502 op[shortpitch * 2] = (a1 - b1) >> 1; |
505 op[shortpitch * 3] = (d1 - c1) >> 1; | 503 op[shortpitch * 3] = (d1 - c1) >> 1; |
506 | 504 |
507 ip++; | 505 ip++; |
508 op++; | 506 op++; |
509 } | 507 } |
510 } | 508 } |
511 | 509 |
512 void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) { | 510 void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) { |
513 int i; | 511 int i; |
514 short tmp[4]; | 512 int16_t tmp[4]; |
515 short *ip = in; | 513 int16_t *ip = in; |
516 short *op = tmp; | 514 int16_t *op = tmp; |
517 int shortpitch = pitch >> 1; | 515 int shortpitch = pitch >> 1; |
518 | 516 |
519 op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; | 517 op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; |
520 op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1); | 518 op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1); |
521 | 519 |
522 | 520 |
523 ip = tmp; | 521 ip = tmp; |
524 op = out; | 522 op = out; |
525 for (i = 0; i < 4; i++) { | 523 for (i = 0; i < 4; i++) { |
526 op[shortpitch * 0] = (ip[0] + 1) >> 1; | 524 op[shortpitch * 0] = (ip[0] + 1) >> 1; |
527 op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1; | 525 op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1; |
528 ip++; | 526 ip++; |
529 op++; | 527 op++; |
530 } | 528 } |
531 } | 529 } |
532 | 530 |
533 void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr, | 531 void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr, |
534 unsigned char *dst_ptr, | 532 uint8_t *dst_ptr, |
535 int pitch, int stride) { | 533 int pitch, int stride) { |
536 int r, c; | 534 int r, c; |
537 short tmp[16]; | 535 short tmp[16]; |
538 vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1); | 536 vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1); |
539 | 537 |
540 for (r = 0; r < 4; r++) { | 538 for (r = 0; r < 4; r++) { |
541 for (c = 0; c < 4; c++) { | 539 for (c = 0; c < 4; c++) { |
542 int a = tmp[r * 4 + c] + pred_ptr[c]; | 540 dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]); |
543 if (a < 0) | |
544 a = 0; | |
545 | |
546 if (a > 255) | |
547 a = 255; | |
548 | |
549 dst_ptr[c] = (unsigned char) a; | |
550 } | 541 } |
551 | 542 |
552 dst_ptr += stride; | 543 dst_ptr += stride; |
553 pred_ptr += pitch; | 544 pred_ptr += pitch; |
554 } | 545 } |
555 } | 546 } |
556 #endif | 547 #endif |
557 | 548 |
558 void vp9_dc_only_idct_add_8x8_c(short input_dc, | 549 void vp9_dc_only_idct_add_8x8_c(short input_dc, |
559 unsigned char *pred_ptr, | 550 uint8_t *pred_ptr, |
560 unsigned char *dst_ptr, | 551 uint8_t *dst_ptr, |
561 int pitch, int stride) { | 552 int pitch, int stride) { |
562 int a1 = ((input_dc + 16) >> 5); | 553 int a1 = ((input_dc + 16) >> 5); |
563 int r, c, b; | 554 int r, c, b; |
564 unsigned char *orig_pred = pred_ptr; | 555 uint8_t *orig_pred = pred_ptr; |
565 unsigned char *orig_dst = dst_ptr; | 556 uint8_t *orig_dst = dst_ptr; |
566 for (b = 0; b < 4; b++) { | 557 for (b = 0; b < 4; b++) { |
567 for (r = 0; r < 4; r++) { | 558 for (r = 0; r < 4; r++) { |
568 for (c = 0; c < 4; c++) { | 559 for (c = 0; c < 4; c++) { |
569 int a = a1 + pred_ptr[c]; | 560 dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); |
570 | |
571 if (a < 0) | |
572 a = 0; | |
573 | |
574 if (a > 255) | |
575 a = 255; | |
576 | |
577 dst_ptr[c] = (unsigned char) a; | |
578 } | 561 } |
579 | 562 |
580 dst_ptr += stride; | 563 dst_ptr += stride; |
581 pred_ptr += pitch; | 564 pred_ptr += pitch; |
582 } | 565 } |
583 dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride; | 566 dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride; |
584 pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch; | 567 pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch; |
585 } | 568 } |
586 } | 569 } |
587 | 570 |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
655 * | 638 * |
656 * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */ | 639 * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */ |
657 static void idctcol(int *blk) { | 640 static void idctcol(int *blk) { |
658 int x0, x1, x2, x3, x4, x5, x6, x7, x8; | 641 int x0, x1, x2, x3, x4, x5, x6, x7, x8; |
659 | 642 |
660 /* shortcut */ | 643 /* shortcut */ |
661 if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | | 644 if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | |
662 (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | | 645 (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | |
663 (x7 = blk[8 * 3]))) { | 646 (x7 = blk[8 * 3]))) { |
664 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] | 647 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] |
665 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] | 648 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] |
666 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); | 649 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); |
667 return; | 650 return; |
668 } | 651 } |
669 | 652 |
670 x0 = (blk[8 * 0] << 8) + 16384; | 653 x0 = (blk[8 * 0] << 8) + 16384; |
671 | 654 |
672 /* first stage */ | 655 /* first stage */ |
673 x8 = W7 * (x4 + x5) + 4; | 656 x8 = W7 * (x4 + x5) + 4; |
674 x4 = (x8 + (W1 - W7) * x4) >> 3; | 657 x4 = (x8 + (W1 - W7) * x4) >> 3; |
675 x5 = (x8 - (W1 + W7) * x5) >> 3; | 658 x5 = (x8 - (W1 + W7) * x5) >> 3; |
676 x8 = W3 * (x6 + x7) + 4; | 659 x8 = W3 * (x6 + x7) + 4; |
(...skipping 24 matching lines...) Expand all Loading... |
701 blk[8 * 1] = (x3 + x2) >> 14; | 684 blk[8 * 1] = (x3 + x2) >> 14; |
702 blk[8 * 2] = (x0 + x4) >> 14; | 685 blk[8 * 2] = (x0 + x4) >> 14; |
703 blk[8 * 3] = (x8 + x6) >> 14; | 686 blk[8 * 3] = (x8 + x6) >> 14; |
704 blk[8 * 4] = (x8 - x6) >> 14; | 687 blk[8 * 4] = (x8 - x6) >> 14; |
705 blk[8 * 5] = (x0 - x4) >> 14; | 688 blk[8 * 5] = (x0 - x4) >> 14; |
706 blk[8 * 6] = (x3 - x2) >> 14; | 689 blk[8 * 6] = (x3 - x2) >> 14; |
707 blk[8 * 7] = (x7 - x1) >> 14; | 690 blk[8 * 7] = (x7 - x1) >> 14; |
708 } | 691 } |
709 | 692 |
710 #define TX_DIM 8 | 693 #define TX_DIM 8 |
711 void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) { | 694 void vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) { |
712 int X[TX_DIM * TX_DIM]; | 695 int X[TX_DIM * TX_DIM]; |
713 int i, j; | 696 int i, j; |
714 int shortpitch = pitch >> 1; | 697 int shortpitch = pitch >> 1; |
715 | 698 |
716 for (i = 0; i < TX_DIM; i++) { | 699 for (i = 0; i < TX_DIM; i++) { |
717 for (j = 0; j < TX_DIM; j++) { | 700 for (j = 0; j < TX_DIM; j++) { |
718 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 | 701 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 |
719 + (coefs[i * TX_DIM + j] < 0)) >> 2; | 702 + (coefs[i * TX_DIM + j] < 0)) >> 2; |
720 } | 703 } |
721 } | 704 } |
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
820 blk[8 * 0] = (x7 + x1) >> 14; | 803 blk[8 * 0] = (x7 + x1) >> 14; |
821 blk[8 * 1] = (x3 + x2) >> 14; | 804 blk[8 * 1] = (x3 + x2) >> 14; |
822 blk[8 * 2] = (x0 + x4) >> 14; | 805 blk[8 * 2] = (x0 + x4) >> 14; |
823 blk[8 * 3] = (x8 + x6) >> 14; | 806 blk[8 * 3] = (x8 + x6) >> 14; |
824 blk[8 * 4] = (x8 - x6) >> 14; | 807 blk[8 * 4] = (x8 - x6) >> 14; |
825 blk[8 * 5] = (x0 - x4) >> 14; | 808 blk[8 * 5] = (x0 - x4) >> 14; |
826 blk[8 * 6] = (x3 - x2) >> 14; | 809 blk[8 * 6] = (x3 - x2) >> 14; |
827 blk[8 * 7] = (x7 - x1) >> 14; | 810 blk[8 * 7] = (x7 - x1) >> 14; |
828 } | 811 } |
829 | 812 |
830 void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) { | 813 void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) { |
831 int X[TX_DIM * TX_DIM]; | 814 int X[TX_DIM * TX_DIM]; |
832 int i, j; | 815 int i, j; |
833 int shortpitch = pitch >> 1; | 816 int shortpitch = pitch >> 1; |
834 | 817 |
835 for (i = 0; i < TX_DIM; i++) { | 818 for (i = 0; i < TX_DIM; i++) { |
836 for (j = 0; j < TX_DIM; j++) { | 819 for (j = 0; j < TX_DIM; j++) { |
837 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 | 820 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 |
838 + (coefs[i * TX_DIM + j] < 0)) >> 2; | 821 + (coefs[i * TX_DIM + j] < 0)) >> 2; |
839 } | 822 } |
840 } | 823 } |
841 | 824 |
842 /* Do first 4 row idct only since non-zero dct coefficients are all in | 825 /* Do first 4 row idct only since non-zero dct coefficients are all in |
843 * upper-left 4x4 area. */ | 826 * upper-left 4x4 area. */ |
844 for (i = 0; i < 4; i++) | 827 for (i = 0; i < 4; i++) |
845 idctrow10(X + 8 * i); | 828 idctrow10(X + 8 * i); |
846 | 829 |
847 for (i = 0; i < 8; i++) | 830 for (i = 0; i < 8; i++) |
848 idctcol10(X + i); | 831 idctcol10(X + i); |
849 | 832 |
850 for (i = 0; i < TX_DIM; i++) { | 833 for (i = 0; i < TX_DIM; i++) { |
851 for (j = 0; j < TX_DIM; j++) { | 834 for (j = 0; j < TX_DIM; j++) { |
852 block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; | 835 block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; |
853 } | 836 } |
854 } | 837 } |
855 } | 838 } |
856 | 839 |
857 void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) { | 840 void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) { |
858 int i; | 841 int i; |
859 short *ip = input; // 0,1, 4, 8 | 842 int16_t *ip = input; // 0, 1, 4, 8 |
860 short *op = output; | 843 int16_t *op = output; |
861 for (i = 0; i < 16; i++) { | 844 for (i = 0; i < 16; i++) { |
862 op[i] = 0; | 845 op[i] = 0; |
863 } | 846 } |
864 | 847 |
865 op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1; | 848 op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1; |
866 op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1; | 849 op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1; |
867 op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1; | 850 op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1; |
868 op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1; | 851 op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1; |
869 } | 852 } |
870 | 853 |
871 | 854 |
872 #if 0 | 855 #if 0 |
873 // Keep a really bad float version as reference for now. | 856 // Keep a really bad float version as reference for now. |
874 void vp9_short_idct16x16_c(short *input, short *output, int pitch) { | 857 void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { |
875 | 858 |
876 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 859 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
877 { | 860 { |
878 double x; | 861 double x; |
879 const int short_pitch = pitch >> 1; | 862 const int short_pitch = pitch >> 1; |
880 int i, j, k, l; | 863 int i, j, k, l; |
881 for (l = 0; l < 16; ++l) { | 864 for (l = 0; l < 16; ++l) { |
882 for (k = 0; k < 16; ++k) { | 865 for (k = 0; k < 16; ++k) { |
883 double s = 0; | 866 double s = 0; |
884 for (i = 0; i < 16; ++i) { | 867 for (i = 0; i < 16; ++i) { |
885 for (j = 0; j < 16; ++j) { | 868 for (j = 0; j < 16; ++j) { |
886 x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; | 869 x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; |
887 if (i != 0) | 870 if (i != 0) |
888 x *= sqrt(2.0); | 871 x *= sqrt(2.0); |
889 if (j != 0) | 872 if (j != 0) |
890 x *= sqrt(2.0); | 873 x *= sqrt(2.0); |
891 s += x; | 874 s += x; |
892 } | 875 } |
893 } | 876 } |
894 output[k*short_pitch+l] = (short)round(s); | 877 output[k*short_pitch+l] = (short)round(s); |
895 } | 878 } |
896 } | 879 } |
897 } | 880 } |
898 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 881 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
899 } | 882 } |
900 #endif | 883 #endif |
901 | 884 |
902 #define TEST_INT_16x16_IDCT 1 | 885 #define TEST_INT_16x16_IDCT 1 |
903 #if !TEST_INT_16x16_IDCT | 886 #if !TEST_INT_16x16_IDCT |
904 static const double C1 = 0.995184726672197; | |
905 static const double C2 = 0.98078528040323; | |
906 static const double C3 = 0.956940335732209; | |
907 static const double C4 = 0.923879532511287; | |
908 static const double C5 = 0.881921264348355; | |
909 static const double C6 = 0.831469612302545; | |
910 static const double C7 = 0.773010453362737; | |
911 static const double C8 = 0.707106781186548; | |
912 static const double C9 = 0.634393284163646; | |
913 static const double C10 = 0.555570233019602; | |
914 static const double C11 = 0.471396736825998; | |
915 static const double C12 = 0.38268343236509; | |
916 static const double C13 = 0.290284677254462; | |
917 static const double C14 = 0.195090322016128; | |
918 static const double C15 = 0.098017140329561; | |
919 | |
920 | 887 |
921 static void butterfly_16x16_idct_1d(double input[16], double output[16]) { | 888 static void butterfly_16x16_idct_1d(double input[16], double output[16]) { |
922 | 889 |
| 890 static const double C1 = 0.995184726672197; |
| 891 static const double C2 = 0.98078528040323; |
| 892 static const double C3 = 0.956940335732209; |
| 893 static const double C4 = 0.923879532511287; |
| 894 static const double C5 = 0.881921264348355; |
| 895 static const double C6 = 0.831469612302545; |
| 896 static const double C7 = 0.773010453362737; |
| 897 static const double C8 = 0.707106781186548; |
| 898 static const double C9 = 0.634393284163646; |
| 899 static const double C10 = 0.555570233019602; |
| 900 static const double C11 = 0.471396736825998; |
| 901 static const double C12 = 0.38268343236509; |
| 902 static const double C13 = 0.290284677254462; |
| 903 static const double C14 = 0.195090322016128; |
| 904 static const double C15 = 0.098017140329561; |
| 905 |
923 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 906 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
924 { | 907 { |
925 double step[16]; | 908 double step[16]; |
926 double intermediate[16]; | 909 double intermediate[16]; |
927 double temp1, temp2; | 910 double temp1, temp2; |
928 | 911 |
929 | 912 |
930 // step 1 and 2 | 913 // step 1 and 2 |
931 step[ 0] = input[0] + input[8]; | 914 step[ 0] = input[0] + input[8]; |
932 step[ 1] = input[0] - input[8]; | 915 step[ 1] = input[0] - input[8]; |
(...skipping 191 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1124 output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); | 1107 output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); |
1125 if (n == 0) | 1108 if (n == 0) |
1126 output[k] = output[k]/kSqrt2; | 1109 output[k] = output[k]/kSqrt2; |
1127 } | 1110 } |
1128 } | 1111 } |
1129 } | 1112 } |
1130 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 1113 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
1131 } | 1114 } |
1132 #endif | 1115 #endif |
1133 | 1116 |
1134 void vp9_short_idct16x16_c(short *input, short *output, int pitch) { | 1117 void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { |
1135 | 1118 |
1136 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 1119 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
1137 { | 1120 { |
1138 double out[16*16], out2[16*16]; | 1121 double out[16*16], out2[16*16]; |
1139 const int short_pitch = pitch >> 1; | 1122 const int short_pitch = pitch >> 1; |
1140 int i, j; | 1123 int i, j; |
1141 // First transform rows | 1124 // First transform rows |
1142 for (i = 0; i < 16; ++i) { | 1125 for (i = 0; i < 16; ++i) { |
1143 double temp_in[16], temp_out[16]; | 1126 double temp_in[16], temp_out[16]; |
1144 for (j = 0; j < 16; ++j) | 1127 for (j = 0; j < 16; ++j) |
(...skipping 11 matching lines...) Expand all Loading... |
1156 for (j = 0; j < 16; ++j) | 1139 for (j = 0; j < 16; ++j) |
1157 out2[j*16 + i] = temp_out[j]; | 1140 out2[j*16 + i] = temp_out[j]; |
1158 } | 1141 } |
1159 for (i = 0; i < 16*16; ++i) | 1142 for (i = 0; i < 16*16; ++i) |
1160 output[i] = round(out2[i]/128); | 1143 output[i] = round(out2[i]/128); |
1161 } | 1144 } |
1162 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 1145 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
1163 } | 1146 } |
1164 | 1147 |
1165 #else | 1148 #else |
| 1149 |
/* Fixed-point parameters shared by the integer 16-point inverse DCT helpers.
 * INITIAL_SHIFT is a pre-scale applied while the first butterfly stage is
 * built; RIGHT_SHIFT is the number of fractional bits carried by the integer
 * cosine constants below.  Each *_ROUNDING value is half of the matching
 * shift step, so that "add then shift" rounds to nearest.
 */
#define INITIAL_SHIFT 2
#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
#define RIGHT_SHIFT 14
#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))

/* Integer cosine table.  The values correspond to the double-precision
 * C1..C15 of the floating-point 16-point transform multiplied by
 * 2^RIGHT_SHIFT and rounded (e.g. 0.707106781186548 * 16384 ~= 11585).
 */
static const int16_t C1 = 16305;
static const int16_t C2 = 16069;
static const int16_t C3 = 15679;
static const int16_t C4 = 15137;
static const int16_t C5 = 14449;
static const int16_t C6 = 13623;
static const int16_t C7 = 12665;
static const int16_t C8 = 11585;
static const int16_t C9 = 10394;
static const int16_t C10 = 9102;
static const int16_t C11 = 7723;
static const int16_t C12 = 6270;
static const int16_t C13 = 4756;
static const int16_t C14 = 3196;
static const int16_t C15 = 1606;

/* Returns (value + bias) >> bits using plain int arithmetic. */
static int idct16_bias_shift(int value, int bias, int bits) {
  return (value + bias) >> bits;
}

/* Drops the RIGHT_SHIFT fractional bits of a product, rounding to nearest. */
static int idct16_from_q14(int value) {
  return idct16_bias_shift(value, RIGHT_ROUNDING, RIGHT_SHIFT);
}

/* Applies the INITIAL_SHIFT pre-scale, rounding to nearest. */
static int idct16_prescale(int value) {
  return idct16_bias_shift(value, INITIAL_ROUNDING, INITIAL_SHIFT);
}

/* One-dimensional 16-point integer inverse DCT butterfly.
 *
 * input:           16 coefficients; every element is read before any element
 *                  of output is written.
 * output:          16 results.
 * last_shift_bits: extra right shift (with round-to-nearest) applied to the
 *                  final sums; 0 disables both the shift and the rounding.
 *
 * Values stored in the int16_t work arrays are truncated to 16 bits between
 * stages, exactly as the stage-by-stage stores require.
 */
static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],
                                    int last_shift_bits) {
  int16_t s[16];    /* current butterfly stage */
  int16_t mid[16];  /* stage-3 results feeding stage 4 */
  int m8, m9, m10, m11, m12, m13, m14, m15;  /* odd-input partial terms */
  int a, b;
  int k;

  /* Combined shift used where a C8 product is folded into the pre-scale. */
  const int wide_shift = RIGHT_SHIFT + INITIAL_SHIFT;
  const int wide_bias = 1 << (wide_shift - 1);
  const int final_bias =
      (last_shift_bits > 0) ? (1 << (last_shift_bits - 1)) : 0;

  /* Stages 1 and 2: even inputs. */
  s[0] = idct16_prescale(input[0] + input[8]);
  s[1] = idct16_prescale(input[0] - input[8]);

  a = idct16_from_q14(input[4] * C12 - input[12] * C4);
  s[2] = idct16_bias_shift(2 * (a * C8), wide_bias, wide_shift);

  a = idct16_from_q14(input[4] * C4 + input[12] * C12);
  s[3] = idct16_bias_shift(2 * (a * C8), wide_bias, wide_shift);

  a = idct16_from_q14(2 * (input[2] * C8));
  b = input[6] + input[10];
  s[4] = idct16_prescale(a + b);
  s[5] = idct16_prescale(a - b);

  a = idct16_from_q14(2 * (input[14] * C8));
  b = input[6] - input[10];
  s[6] = idct16_prescale(b - a);
  s[7] = idct16_prescale(b + a);

  /* Stages 1 and 2: odd inputs. */
  a = idct16_from_q14(input[3] * C12 + input[13] * C4);
  m8 = idct16_from_q14(2 * (a * C8));

  b = idct16_from_q14(input[13] * C12 - input[3] * C4);
  m9 = idct16_from_q14(2 * (b * C8));

  m10 = idct16_from_q14(2 * (input[9] * C8));
  m11 = input[15] - input[1];
  m12 = input[15] + input[1];
  m13 = idct16_from_q14(2 * (input[7] * C8));

  b = idct16_from_q14(input[5] * C4 - input[11] * C12);
  m14 = idct16_from_q14(2 * (b * C8));

  a = idct16_from_q14(input[11] * C4 + input[5] * C12);
  m15 = idct16_from_q14(2 * (a * C8));

  s[8] = idct16_prescale(m8 + m14);
  s[9] = idct16_prescale(m9 + m15);
  s[10] = idct16_prescale(m10 + m11);
  s[11] = idct16_prescale(m10 - m11);
  s[12] = idct16_prescale(m12 + m13);
  s[13] = idct16_prescale(m12 - m13);
  s[14] = idct16_prescale(m8 - m14);
  s[15] = idct16_prescale(m9 - m15);

  /* Stage 3. */
  mid[0] = s[0] + s[3];
  mid[1] = s[1] + s[2];
  mid[2] = s[1] - s[2];
  mid[3] = s[0] - s[3];

  mid[4] = idct16_from_q14(s[4] * C14 - s[7] * C2);
  mid[7] = idct16_from_q14(s[4] * C2 + s[7] * C14);
  mid[5] = idct16_from_q14(s[5] * C10 - s[6] * C6);
  mid[6] = idct16_from_q14(s[5] * C6 + s[6] * C10);

  mid[8] = s[8] + s[11];
  mid[9] = s[9] + s[10];
  mid[10] = s[9] - s[10];
  mid[11] = s[8] - s[11];
  mid[12] = s[12] + s[15];
  mid[13] = s[13] + s[14];
  mid[14] = s[13] - s[14];
  mid[15] = s[12] - s[15];

  /* Stage 4: mirror butterflies on the lower half, rotations on the upper. */
  for (k = 0; k < 4; ++k) {
    s[k] = mid[k] + mid[7 - k];
    s[7 - k] = mid[k] - mid[7 - k];
  }

  s[8] = idct16_from_q14(mid[8] * C7 - mid[15] * C9);
  s[9] = idct16_from_q14(mid[9] * C11 + mid[14] * C5);
  s[10] = idct16_from_q14(mid[10] * C3 - mid[13] * C13);
  s[11] = idct16_from_q14(mid[11] * C15 + mid[12] * C1);
  s[12] = idct16_from_q14(mid[12] * C15 - mid[11] * C1);
  s[13] = idct16_from_q14(mid[10] * C13 + mid[13] * C3);
  s[14] = idct16_from_q14(mid[14] * C11 - mid[9] * C5);
  s[15] = idct16_from_q14(mid[8] * C9 + mid[15] * C7);

  /* Stage 5: final mirror butterflies with the caller-selected down-shift. */
  for (k = 0; k < 8; ++k) {
    output[k] = idct16_bias_shift(s[k] + s[15 - k], final_bias,
                                  last_shift_bits);
    output[15 - k] = idct16_bias_shift(s[k] - s[15 - k], final_bias,
                                       last_shift_bits);
  }
}
1367 | 1351 |
/* 16x16 integer inverse DCT.
 *
 * input:  coefficient rows, consecutive rows being (pitch >> 1) int16_t
 *         elements apart.
 * output: 16x16 result written with a fixed row stride of 16 elements.
 * pitch:  input row pitch; halved to obtain the int16_t element stride.
 *
 * The row pass uses no final down-shift; the column pass rounds and shifts
 * its results right by 3 bits.
 */
void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
  int16_t rows[16 * 16];
  int16_t column[16], result[16];
  const int row_stride = pitch >> 1;
  int r, c;

  // Row pass: each input row becomes one row of the intermediate block.
  for (r = 0; r < 16; ++r)
    butterfly_16x16_idct_1d(input + r * row_stride, rows + r * 16, 0);

  // Column pass: gather a column, transform it, scatter it into output.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      column[r] = rows[r * 16 + c];
    butterfly_16x16_idct_1d(column, result, 3);
    for (r = 0; r < 16; ++r)
      output[r * 16 + c] = result[r];
  }
}
1391 | 1375 |
/* The following function is called when we know the maximum number of non-zero
 * dct coefficients is less than or equal to 10.
 */
1395 static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], | 1379 static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], |
1396 int last_shift_bits) { | 1380 int last_shift_bits) { |
1397 int16_t step[16] = {0}; | 1381 int16_t step[16] = {0}; |
(...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1541 butterfly_16x16_idct10_1d(temp_in, temp_out, 3); | 1525 butterfly_16x16_idct10_1d(temp_in, temp_out, 3); |
1542 for (j = 0; j < 16; ++j) | 1526 for (j = 0; j < 16; ++j) |
1543 output[j*16 + i] = temp_out[j]; | 1527 output[j*16 + i] = temp_out[j]; |
1544 } | 1528 } |
1545 } | 1529 } |
1546 #undef INITIAL_SHIFT | 1530 #undef INITIAL_SHIFT |
1547 #undef INITIAL_ROUNDING | 1531 #undef INITIAL_ROUNDING |
1548 #undef RIGHT_SHIFT | 1532 #undef RIGHT_SHIFT |
1549 #undef RIGHT_ROUNDING | 1533 #undef RIGHT_ROUNDING |
1550 #endif | 1534 #endif |
| 1535 |
| 1536 #if !CONFIG_DWTDCTHYBRID |
/* Scaling hooks used by the floating-point transforms below.  In this
 * double-precision build they reduce to "multiply by two" and "identity".
 * Both the argument and the whole expansion are parenthesized so the macros
 * stay correct for compound arguments and inside larger expressions.
 */
#define DownshiftMultiplyBy2(x) ((x) * 2)
#define DownshiftMultiply(x) (x)

/* One-dimensional 16-point inverse DCT butterfly in double precision.
 *
 * input:  16 coefficients read from input[stride * k], k = 0..15.
 * output: 16 results written to output[stride * k], k = 0..15.
 * stride: element distance between consecutive samples in both buffers.
 *
 * All inputs are read before any output element is written.
 */
static void idct16(double *input, double *output, int stride) {
  static const double C1 = 0.995184726672197;
  static const double C2 = 0.98078528040323;
  static const double C3 = 0.956940335732209;
  static const double C4 = 0.923879532511287;
  static const double C5 = 0.881921264348355;
  static const double C6 = 0.831469612302545;
  static const double C7 = 0.773010453362737;
  static const double C8 = 0.707106781186548;
  static const double C9 = 0.634393284163646;
  static const double C10 = 0.555570233019602;
  static const double C11 = 0.471396736825998;
  static const double C12 = 0.38268343236509;
  static const double C13 = 0.290284677254462;
  static const double C14 = 0.195090322016128;
  static const double C15 = 0.098017140329561;

  double x[16];    /* gathered input samples */
  double s[16];    /* current butterfly stage */
  double m[16];    /* odd-input partial terms (only 8..15 are used) */
  double mid[16];  /* stage-3 results feeding stage 4 */
  double a, b;
  int k;

  for (k = 0; k < 16; ++k)
    x[k] = input[stride * k];

  // Stages 1 and 2: even inputs.
  s[0] = x[0] + x[8];
  s[1] = x[0] - x[8];

  s[2] = DownshiftMultiplyBy2(DownshiftMultiply(x[4] * C12 - x[12] * C4) * C8);
  s[3] = DownshiftMultiplyBy2(DownshiftMultiply(x[4] * C4 + x[12] * C12) * C8);

  a = DownshiftMultiplyBy2(x[2] * C8);
  b = x[6] + x[10];
  s[4] = a + b;
  s[5] = a - b;

  a = DownshiftMultiplyBy2(x[14] * C8);
  b = x[6] - x[10];
  s[6] = b - a;
  s[7] = b + a;

  // Stages 1 and 2: odd inputs.
  m[8] = DownshiftMultiplyBy2(DownshiftMultiply(x[3] * C12 + x[13] * C4) * C8);
  m[9] = DownshiftMultiplyBy2(DownshiftMultiply(x[13] * C12 - x[3] * C4) * C8);
  m[10] = DownshiftMultiplyBy2(x[9] * C8);
  m[11] = x[15] - x[1];
  m[12] = x[15] + x[1];
  m[13] = DownshiftMultiplyBy2(x[7] * C8);
  m[14] = DownshiftMultiplyBy2(DownshiftMultiply(x[5] * C4 - x[11] * C12) * C8);
  m[15] = DownshiftMultiplyBy2(DownshiftMultiply(x[11] * C4 + x[5] * C12) * C8);

  s[8] = m[8] + m[14];
  s[9] = m[9] + m[15];
  s[10] = m[10] + m[11];
  s[11] = m[10] - m[11];
  s[12] = m[12] + m[13];
  s[13] = m[12] - m[13];
  s[14] = m[8] - m[14];
  s[15] = m[9] - m[15];

  // Stage 3.
  mid[0] = s[0] + s[3];
  mid[1] = s[1] + s[2];
  mid[2] = s[1] - s[2];
  mid[3] = s[0] - s[3];

  mid[4] = DownshiftMultiply(s[4] * C14 - s[7] * C2);
  mid[7] = DownshiftMultiply(s[4] * C2 + s[7] * C14);
  mid[5] = DownshiftMultiply(s[5] * C10 - s[6] * C6);
  mid[6] = DownshiftMultiply(s[5] * C6 + s[6] * C10);

  mid[8] = s[8] + s[11];
  mid[9] = s[9] + s[10];
  mid[10] = s[9] - s[10];
  mid[11] = s[8] - s[11];
  mid[12] = s[12] + s[15];
  mid[13] = s[13] + s[14];
  mid[14] = s[13] - s[14];
  mid[15] = s[12] - s[15];

  // Stage 4: mirror butterflies on the lower half, rotations on the upper.
  for (k = 0; k < 4; ++k) {
    s[k] = mid[k] + mid[7 - k];
    s[7 - k] = mid[k] - mid[7 - k];
  }

  s[8] = DownshiftMultiply(mid[8] * C7 - mid[15] * C9);
  s[9] = DownshiftMultiply(mid[9] * C11 + mid[14] * C5);
  s[10] = DownshiftMultiply(mid[10] * C3 - mid[13] * C13);
  s[11] = DownshiftMultiply(mid[11] * C15 + mid[12] * C1);
  s[12] = DownshiftMultiply(mid[12] * C15 - mid[11] * C1);
  s[13] = DownshiftMultiply(mid[10] * C13 + mid[13] * C3);
  s[14] = DownshiftMultiply(mid[14] * C11 - mid[9] * C5);
  s[15] = DownshiftMultiply(mid[8] * C9 + mid[15] * C7);

  // Stage 5: final mirror butterflies written to the strided output.
  for (k = 0; k < 8; ++k) {
    output[stride * k] = s[k] + s[15 - k];
    output[stride * (15 - k)] = s[k] - s[15 - k];
  }
}
| 1742 |
/* One-dimensional 32-point inverse DCT in double precision, built from two
 * 16-point transforms.
 *
 * input:  32 coefficients read from input[stride * k], k = 0..31.
 * output: 32 results written to output[stride * k], k = 0..31.
 * stride: element distance between consecutive samples in both buffers.
 *
 * Even-indexed coefficients feed the first idct16 call directly.  The second
 * call receives sums of neighbouring odd-indexed coefficients; its results
 * are divided by 2 * cos(pi * (2k + 1) / 64) before the final butterflies.
 */
static void butterfly_32_idct_1d(double *input, double *output, int stride) {
  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
  // odd_cos[k] = cos(pi * (2 * k + 1) / 64), k = 0..15
  static const double odd_cos[16] = {
    0.998795456205, 0.989176509965, 0.970031253195, 0.941544065183,
    0.903989293123, 0.857728610000, 0.803207531481, 0.740951125355,
    0.671558954847, 0.595699304492, 0.514102744193, 0.427555093430,
    0.336889853392, 0.242980179903, 0.146730474455, 0.049067674327
  };

  double packed[32];  // inputs of the two 16-point transforms
  double half[32];    // their outputs
  int k;

  // Lower half: the even-indexed coefficients in order.
  for (k = 0; k < 16; ++k)
    packed[k] = input[stride * (2 * k)];

  // Upper half: first odd coefficient scaled, then sums of odd neighbours.
  packed[16] = DownshiftMultiplyBy2(input[stride * 1] * C16);
  for (k = 1; k < 16; ++k)
    packed[16 + k] = input[stride * (2 * k + 1)] + input[stride * (2 * k - 1)];

  idct16(packed, half, 1);
  idct16(packed + 16, half + 16, 1);

  for (k = 0; k < 16; ++k)
    half[16 + k] = DownshiftMultiply(half[16 + k] / (2 * odd_cos[k]));

  // Final butterflies: sums fill outputs 0..15, mirrored differences 16..31.
  for (k = 0; k < 16; ++k) {
    output[stride * k] = half[k] + half[16 + k];
    output[stride * (16 + k)] = half[15 - k] - half[31 - k];
  }
}
| 1852 |
// Inverse 32x32 transform evaluated in double precision.
// butterfly_32_idct_1d is applied to each of the 32 input rows and then to
// each column of that intermediate result; every value is divided by 128,
// rounded, and stored in output as a dense 32x32 array (row stride 32).
//   input  - coefficients; row r starts at input[r * (pitch >> 1)]
//   output - receives 32 * 32 values
//   pitch  - input row pitch; halved to index int16_t elements
void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
  double row_pass[32 * 32], col_pass[32 * 32];
  double line_in[32], line_out[32];
  const int stride = pitch >> 1;
  int r, c, n;

  vp9_clear_system_state();  // Make it simd safe : __asm emms;

  // Horizontal pass: one 1-D transform per input row.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      line_in[c] = input[c + r * stride];
    }
    butterfly_32_idct_1d(line_in, line_out, 1);
    for (c = 0; c < 32; ++c) {
      row_pass[c + r * 32] = line_out[c];
    }
  }

  // Vertical pass: one 1-D transform per column of the row-pass result.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) {
      line_in[r] = row_pass[r * 32 + c];
    }
    butterfly_32_idct_1d(line_in, line_out, 1);
    for (r = 0; r < 32; ++r) {
      col_pass[r * 32 + c] = line_out[r];
    }
  }

  // Final scaling and rounding to int16_t.
  for (n = 0; n < 32 * 32; ++n) {
    output[n] = round(col_pass[n] / 128);
  }

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 1882 |
| 1883 #else // !CONFIG_DWTDCTHYBRID |
| 1884 |
| 1885 #if DWT_TYPE == 53 |
| 1886 |
| 1887 // Note: block length must be even for this implementation |
// One-dimensional row synthesis step for the DWT_TYPE == 53 filter.
// Note: length must be even and at least 2.
//   length   - number of output samples written to x
//   lowpass  - length / 2 low-band samples; modified in place by the first
//              stage
//   highpass - length / 2 high-band samples (read only)
//   x        - receives the interleaved samples
static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int16_t prev_high = highpass[0];
  int16_t low;
  int16_t *dst = x;
  int k;

  // Stage 1: subtract the rounded mean of the two neighbouring high-band
  // samples from each low-band sample (the first sample uses highpass[0]
  // for both neighbours).
  for (k = 0; k < half; ++k) {
    lowpass[k] -= (prev_high + highpass[k] + 1) >> 1;
    prev_high = highpass[k];
  }

  // Stage 2: emit (even, odd) output pairs for all but the last pair.
  for (k = 0; k < half - 1; ++k) {
    low = lowpass[k];
    dst[0] = (low + 1) >> 1;
    dst[1] = highpass[k] + ((low + lowpass[k + 1] + 2) >> 2);
    dst += 2;
  }

  // Last pair: there is no right-hand low-band neighbour.
  low = lowpass[half - 1];
  dst[0] = (low + 1) >> 1;
  dst[1] = highpass[half - 1] + ((low + 1) >> 1);
}
| 1912 |
// One-dimensional column synthesis step for the DWT_TYPE == 53 filter.
// Same first stage as synthesis_53_row, but the second stage writes the
// low-band value unshifted and doubles the high-band sample.
// Note: length must be even and at least 2.
//   length   - number of output samples written to x
//   lowpass  - length / 2 low-band samples; modified in place by the first
//              stage
//   highpass - length / 2 high-band samples (read only)
//   x        - receives the interleaved samples
static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int16_t prev_high = highpass[0];
  int16_t low;
  int16_t *dst = x;
  int k;

  // Stage 1: subtract the rounded mean of the two neighbouring high-band
  // samples from each low-band sample.
  for (k = 0; k < half; ++k) {
    lowpass[k] -= (prev_high + highpass[k] + 1) >> 1;
    prev_high = highpass[k];
  }

  // Stage 2: emit (even, odd) output pairs for all but the last pair.
  for (k = 0; k < half - 1; ++k) {
    low = lowpass[k];
    dst[0] = low;
    dst[1] = (highpass[k] << 1) + ((low + lowpass[k + 1] + 1) >> 1);
    dst += 2;
  }

  // Last pair: there is no right-hand low-band neighbour.
  dst[0] = lowpass[half - 1];
  dst[1] = (highpass[half - 1] << 1) + lowpass[half - 1];
}
| 1938 |
| 1939 static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c, |
| 1940 int pitch_c, int16_t *x, int pitch_x) { |
| 1941 int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; |
| 1942 short buffer[2 * DWT_MAX_LENGTH]; |
| 1943 |
| 1944 th[0] = hh; |
| 1945 tw[0] = hw; |
| 1946 for (i = 1; i <= levels; i++) { |
| 1947 th[i] = (th[i - 1] + 1) >> 1; |
| 1948 tw[i] = (tw[i - 1] + 1) >> 1; |
| 1949 } |
| 1950 for (lv = levels - 1; lv >= 0; lv--) { |
| 1951 nh = th[lv]; |
| 1952 nw = tw[lv]; |
| 1953 hh = th[lv + 1]; |
| 1954 hw = tw[lv + 1]; |
| 1955 if ((nh < 2) || (nw < 2)) continue; |
| 1956 for (j = 0; j < nw; j++) { |
| 1957 for (i = 0; i < nh; i++) |
| 1958 buffer[i] = c[i * pitch_c + j]; |
| 1959 synthesis_53_col(nh, buffer, buffer + hh, buffer + nh); |
| 1960 for (i = 0; i < nh; i++) |
| 1961 c[i * pitch_c + j] = buffer[i + nh]; |
| 1962 } |
| 1963 for (i = 0; i < nh; i++) { |
| 1964 memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer)); |
| 1965 synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]); |
| 1966 } |
| 1967 } |
| 1968 for (i = 0; i < height; i++) { |
| 1969 for (j = 0; j < width; j++) { |
| 1970 x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ? |
| 1971 ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) : |
| 1972 -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS); |
| 1973 } |
| 1974 } |
| 1975 } |
| 1976 |
| 1977 #elif DWT_TYPE == 26 |
| 1978 |
| 1979 // Note: block length must be even for this implementation |
// One-dimensional row synthesis step for the DWT_TYPE == 26 filter.
// Note: length must be even.
//   length   - number of output samples written to x
//   lowpass  - length / 2 low-band samples (read only)
//   highpass - length / 2 high-band samples; adjusted in place when
//              length / 2 >= 4
//   x        - receives the interleaved samples
static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int16_t r, s;
  int k;

  // Only blocks with at least four sample pairs get the high-band
  // correction from neighbouring low-band samples.
  if (half >= 4) {
    int16_t prev_low = lowpass[0];
    for (k = 0; k < half - 1; ++k) {
      highpass[k] += (prev_low - lowpass[k + 1] + 4) >> 3;
      prev_low = lowpass[k];
    }
    // Last sample has no right-hand neighbour; it uses itself instead.
    highpass[half - 1] += (prev_low - lowpass[half - 1] + 4) >> 3;
  }

  // Rounded half-sum / half-difference of each (low, high) pair.
  for (k = 0; k < half; ++k) {
    s = highpass[k];
    r = lowpass[k];
    x[2 * k] = (r + s + 1) >> 1;
    x[2 * k + 1] = (r - s + 1) >> 1;
  }
}
| 2004 |
// One-dimensional column synthesis step for the DWT_TYPE == 26 filter.
// Same high-band correction as synthesis_26_row, but the output pairs are
// the plain sum and difference (no halving).
// Note: length must be even.
//   length   - number of output samples written to x
//   lowpass  - length / 2 low-band samples (read only)
//   highpass - length / 2 high-band samples; adjusted in place when
//              length / 2 >= 4
//   x        - receives the interleaved samples
static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int16_t r, s;
  int k;

  // Only blocks with at least four sample pairs get the high-band
  // correction from neighbouring low-band samples.
  if (half >= 4) {
    int16_t prev_low = lowpass[0];
    for (k = 0; k < half - 1; ++k) {
      highpass[k] += (prev_low - lowpass[k + 1] + 4) >> 3;
      prev_low = lowpass[k];
    }
    // Last sample has no right-hand neighbour; it uses itself instead.
    highpass[half - 1] += (prev_low - lowpass[half - 1] + 4) >> 3;
  }

  // Sum / difference of each (low, high) pair.
  for (k = 0; k < half; ++k) {
    s = highpass[k];
    r = lowpass[k];
    x[2 * k] = r + s;
    x[2 * k + 1] = r - s;
  }
}
| 2029 |
| 2030 static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c, |
| 2031 int pitch_c, int16_t *x, int pitch_x) { |
| 2032 int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; |
| 2033 int16_t buffer[2 * DWT_MAX_LENGTH]; |
| 2034 |
| 2035 th[0] = hh; |
| 2036 tw[0] = hw; |
| 2037 for (i = 1; i <= levels; i++) { |
| 2038 th[i] = (th[i - 1] + 1) >> 1; |
| 2039 tw[i] = (tw[i - 1] + 1) >> 1; |
| 2040 } |
| 2041 for (lv = levels - 1; lv >= 0; lv--) { |
| 2042 nh = th[lv]; |
| 2043 nw = tw[lv]; |
| 2044 hh = th[lv + 1]; |
| 2045 hw = tw[lv + 1]; |
| 2046 if ((nh < 2) || (nw < 2)) continue; |
| 2047 for (j = 0; j < nw; j++) { |
| 2048 for (i = 0; i < nh; i++) |
| 2049 buffer[i] = c[i * pitch_c + j]; |
| 2050 synthesis_26_col(nh, buffer, buffer + hh, buffer + nh); |
| 2051 for (i = 0; i < nh; i++) |
| 2052 c[i * pitch_c + j] = buffer[i + nh]; |
| 2053 } |
| 2054 for (i = 0; i < nh; i++) { |
| 2055 memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer)); |
| 2056 synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]); |
| 2057 } |
| 2058 } |
| 2059 for (i = 0; i < height; i++) { |
| 2060 for (j = 0; j < width; j++) { |
| 2061 x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ? |
| 2062 ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) : |
| 2063 -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS); |
| 2064 } |
| 2065 } |
| 2066 } |
| 2067 |
| 2068 #elif DWT_TYPE == 97 |
| 2069 |
| 2070 static void synthesis_97(int length, double *lowpass, double *highpass, |
| 2071 double *x) { |
| 2072 static const double a_predict1 = -1.586134342; |
| 2073 static const double a_update1 = -0.05298011854; |
| 2074 static const double a_predict2 = 0.8829110762; |
| 2075 static const double a_update2 = 0.4435068522; |
| 2076 static const double s_low = 1.149604398; |
| 2077 static const double s_high = 1/1.149604398; |
| 2078 static const double inv_s_low = 1 / s_low; |
| 2079 static const double inv_s_high = 1 / s_high; |
| 2080 int i; |
| 2081 double y[DWT_MAX_LENGTH]; |
| 2082 // Undo pack and scale |
| 2083 for (i = 0; i < length / 2; i++) { |
| 2084 y[i * 2] = lowpass[i] * inv_s_low; |
| 2085 y[i * 2 + 1] = highpass[i] * inv_s_high; |
| 2086 } |
| 2087 memcpy(x, y, sizeof(*y) * length); |
| 2088 // Undo update 2 |
| 2089 for (i = 2; i < length; i += 2) { |
| 2090 x[i] -= a_update2 * (x[i-1] + x[i+1]); |
| 2091 } |
| 2092 x[0] -= 2 * a_update2 * x[1]; |
| 2093 // Undo predict 2 |
| 2094 for (i = 1; i < length - 2; i += 2) { |
| 2095 x[i] -= a_predict2 * (x[i - 1] + x[i + 1]); |
| 2096 } |
| 2097 x[length - 1] -= 2 * a_predict2 * x[length - 2]; |
| 2098 // Undo update 1 |
| 2099 for (i = 2; i < length; i += 2) { |
| 2100 x[i] -= a_update1 * (x[i - 1] + x[i + 1]); |
| 2101 } |
| 2102 x[0] -= 2 * a_update1 * x[1]; |
| 2103 // Undo predict 1 |
| 2104 for (i = 1; i < length - 2; i += 2) { |
| 2105 x[i] -= a_predict1 * (x[i - 1] + x[i + 1]); |
| 2106 } |
| 2107 x[length - 1] -= 2 * a_predict1 * x[length - 2]; |
| 2108 } |
| 2109 |
| 2110 static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c, |
| 2111 int pitch_c, int16_t *x, int pitch_x) { |
| 2112 int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; |
| 2113 double buffer[2 * DWT_MAX_LENGTH]; |
| 2114 double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; |
| 2115 |
| 2116 th[0] = hh; |
| 2117 tw[0] = hw; |
| 2118 for (i = 1; i <= levels; i++) { |
| 2119 th[i] = (th[i - 1] + 1) >> 1; |
| 2120 tw[i] = (tw[i - 1] + 1) >> 1; |
| 2121 } |
| 2122 for (lv = levels - 1; lv >= 0; lv--) { |
| 2123 nh = th[lv]; |
| 2124 nw = tw[lv]; |
| 2125 hh = th[lv + 1]; |
| 2126 hw = tw[lv + 1]; |
| 2127 if ((nh < 2) || (nw < 2)) continue; |
| 2128 for (j = 0; j < nw; j++) { |
| 2129 for (i = 0; i < nh; i++) |
| 2130 buffer[i] = c[i * pitch_c + j]; |
| 2131 synthesis_97(nh, buffer, buffer + hh, buffer + nh); |
| 2132 for (i = 0; i < nh; i++) |
| 2133 y[i * DWT_MAX_LENGTH + j] = buffer[i + nh]; |
| 2134 } |
| 2135 for (i = 0; i < nh; i++) { |
| 2136 memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer)); |
| 2137 synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]); |
| 2138 } |
| 2139 } |
| 2140 for (i = 0; i < height; i++) |
| 2141 for (j = 0; j < width; j++) |
| 2142 x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] / |
| 2143 (1 << DWT_PRECISION_BITS)); |
| 2144 } |
| 2145 |
| 2146 #endif // DWT_TYPE |
| 2147 |
| 2148 // TODO(debargha): Implement scaling differently so as not to have to use the |
| 2149 // floating point 16x16 dct |
// Returns a * ca + b * cb. The two products are formed in separate
// statements before they are combined.
static double idct16f_rot_add(double a, double ca, double b, double cb) {
  const double p = a * ca;
  const double q = b * cb;
  return p + q;
}

// Returns a * ca - b * cb. The two products are formed in separate
// statements before they are combined.
static double idct16f_rot_sub(double a, double ca, double b, double cb) {
  const double p = a * ca;
  const double q = b * cb;
  return p - q;
}

// Floating-point 16-point 1-D inverse DCT butterfly.
// All of input is read before anything is written to output.
//   input  - 16 coefficients
//   output - receives 16 reconstructed values
static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {
  static const double C1 = 0.995184726672197;
  static const double C2 = 0.98078528040323;
  static const double C3 = 0.956940335732209;
  static const double C4 = 0.923879532511287;
  static const double C5 = 0.881921264348355;
  static const double C6 = 0.831469612302545;
  static const double C7 = 0.773010453362737;
  static const double C8 = 0.707106781186548;
  static const double C9 = 0.634393284163646;
  static const double C10 = 0.555570233019602;
  static const double C11 = 0.471396736825998;
  static const double C12 = 0.38268343236509;
  static const double C13 = 0.290284677254462;
  static const double C14 = 0.195090322016128;
  static const double C15 = 0.098017140329561;

  double s12[16];  // result of steps 1 and 2
  double pre[16];  // odd-input intermediates (only [8..15] are used)
  double s3[16];   // result of step 3
  double s4[16];   // result of step 4
  int k;

  vp9_clear_system_state();  // Make it simd safe : __asm emms;

  // Steps 1 and 2, even-indexed inputs.
  s12[0] = input[0] + input[8];
  s12[1] = input[0] - input[8];
  s12[2] = 2 * (idct16f_rot_sub(input[4], C12, input[12], C4) * C8);
  s12[3] = 2 * (idct16f_rot_add(input[4], C4, input[12], C12) * C8);
  {
    const double scaled2 = 2 * (input[2] * C8);
    const double sum_6_10 = input[6] + input[10];
    s12[4] = scaled2 + sum_6_10;
    s12[5] = scaled2 - sum_6_10;
  }
  {
    const double scaled14 = 2 * (input[14] * C8);
    const double diff_6_10 = input[6] - input[10];
    s12[6] = diff_6_10 - scaled14;
    s12[7] = diff_6_10 + scaled14;
  }

  // Steps 1 and 2, odd-indexed inputs.
  pre[8] = 2 * (idct16f_rot_add(input[3], C12, input[13], C4) * C8);
  pre[9] = 2 * (idct16f_rot_sub(input[13], C12, input[3], C4) * C8);
  pre[10] = 2 * (input[9] * C8);
  pre[11] = input[15] - input[1];
  pre[12] = input[15] + input[1];
  pre[13] = 2 * (input[7] * C8);
  pre[14] = 2 * (idct16f_rot_sub(input[5], C4, input[11], C12) * C8);
  pre[15] = 2 * (idct16f_rot_add(input[11], C4, input[5], C12) * C8);

  s12[8] = pre[8] + pre[14];
  s12[9] = pre[9] + pre[15];
  s12[10] = pre[10] + pre[11];
  s12[11] = pre[10] - pre[11];
  s12[12] = pre[12] + pre[13];
  s12[13] = pre[12] - pre[13];
  s12[14] = pre[8] - pre[14];
  s12[15] = pre[9] - pre[15];

  // Step 3.
  s3[0] = s12[0] + s12[3];
  s3[1] = s12[1] + s12[2];
  s3[2] = s12[1] - s12[2];
  s3[3] = s12[0] - s12[3];

  s3[4] = idct16f_rot_sub(s12[4], C14, s12[7], C2);
  s3[7] = idct16f_rot_add(s12[4], C2, s12[7], C14);
  s3[5] = idct16f_rot_sub(s12[5], C10, s12[6], C6);
  s3[6] = idct16f_rot_add(s12[5], C6, s12[6], C10);

  s3[8] = s12[8] + s12[11];
  s3[9] = s12[9] + s12[10];
  s3[10] = s12[9] - s12[10];
  s3[11] = s12[8] - s12[11];
  s3[12] = s12[12] + s12[15];
  s3[13] = s12[13] + s12[14];
  s3[14] = s12[13] - s12[14];
  s3[15] = s12[12] - s12[15];

  // Step 4: mirrored sums/differences for the lower half, rotations for the
  // upper half.
  for (k = 0; k < 4; ++k) {
    s4[k] = s3[k] + s3[7 - k];
    s4[7 - k] = s3[k] - s3[7 - k];
  }
  s4[8] = idct16f_rot_sub(s3[8], C7, s3[15], C9);
  s4[9] = idct16f_rot_add(s3[9], C11, s3[14], C5);
  s4[10] = idct16f_rot_sub(s3[10], C3, s3[13], C13);
  s4[11] = idct16f_rot_add(s3[11], C15, s3[12], C1);
  s4[12] = idct16f_rot_sub(s3[12], C15, s3[11], C1);
  s4[13] = idct16f_rot_add(s3[10], C13, s3[13], C3);
  s4[14] = idct16f_rot_sub(s3[14], C11, s3[9], C5);
  s4[15] = idct16f_rot_add(s3[8], C9, s3[15], C7);

  // Step 5: final mirrored sums and differences.
  for (k = 0; k < 8; ++k) {
    output[k] = (s4[k] + s4[15 - k]);
    output[15 - k] = (s4[k] - s4[15 - k]);
  }

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 2356 |
// Floating-point 16x16 inverse DCT with adjustable output scaling.
// butterfly_16x16_idct_1d_f is applied to each input row and then to each
// column of that intermediate result; every value is divided by
// (128 >> scale), rounded, and stored in output as a dense 16x16 array
// (row stride 16).
//   input  - coefficients; row r starts at input[r * (pitch >> 1)]
//   output - receives 16 * 16 values
//   pitch  - input row pitch; halved to index int16_t elements
//   scale  - right-shift applied to the divisor 128
static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
                                    int scale) {
  double row_pass[16 * 16], col_pass[16 * 16];
  double line_in[16], line_out[16];
  const int stride = pitch >> 1;
  int r, c, n;

  vp9_clear_system_state();  // Make it simd safe : __asm emms;

  // Horizontal pass: one 1-D transform per input row.
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      line_in[c] = input[c + r * stride];
    }
    butterfly_16x16_idct_1d_f(line_in, line_out);
    for (c = 0; c < 16; ++c) {
      row_pass[c + r * 16] = line_out[c];
    }
  }

  // Vertical pass: one 1-D transform per column of the row-pass result.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) {
      line_in[r] = row_pass[r * 16 + c];
    }
    butterfly_16x16_idct_1d_f(line_in, line_out);
    for (r = 0; r < 16; ++r) {
      col_pass[r * 16 + c] = line_out[r];
    }
  }

  // Final scaling and rounding to int16_t.
  for (n = 0; n < 16 * 16; ++n) {
    output[n] = round(col_pass[n] / (128 >> scale));
  }

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 2387 |
// In-place 8-point 1-D inverse DCT: replaces x[0..7] with the product of
// the 8x8 matrix below (stored row by row) and the original x.
static void idct8_1d(double *x) {
  static const double idctmat[64] = {
    0.35355339059327, 0.49039264020162, 0.46193976625564, 0.41573480615127,
    0.35355339059327, 0.2777851165098, 0.19134171618254, 0.097545161008064,

    0.35355339059327, 0.41573480615127, 0.19134171618254, -0.097545161008064,
    -0.35355339059327, -0.49039264020161, -0.46193976625564, -0.2777851165098,

    0.35355339059327, 0.2777851165098, -0.19134171618254, -0.49039264020162,
    -0.35355339059327, 0.097545161008064, 0.46193976625564, 0.41573480615127,

    0.35355339059327, 0.097545161008063, -0.46193976625564, -0.2777851165098,
    0.35355339059327, 0.41573480615127, -0.19134171618254, -0.49039264020162,

    0.35355339059327, -0.097545161008063, -0.46193976625564, 0.2777851165098,
    0.35355339059327, -0.41573480615127, -0.19134171618255, 0.49039264020162,

    0.35355339059327, -0.2777851165098, -0.19134171618254, 0.49039264020161,
    -0.35355339059327, -0.097545161008064, 0.46193976625564, -0.41573480615127,

    0.35355339059327, -0.41573480615127, 0.19134171618254, 0.097545161008065,
    -0.35355339059327, 0.49039264020162, -0.46193976625564, 0.2777851165098,

    0.35355339059327, -0.49039264020162, 0.46193976625564, -0.41573480615127,
    0.35355339059327, -0.2777851165098, 0.19134171618255, -0.097545161008064
  };
  double result[8];
  int row, col;

  // Accumulate each output in a scratch array so every dot product sees the
  // unmodified input.
  for (row = 0; row < 8; ++row) {
    const double *basis = &idctmat[row * 8];
    double acc = 0;
    for (col = 0; col < 8; ++col) {
      acc += basis[col] * x[col];
    }
    result[row] = acc;
  }
  for (row = 0; row < 8; ++row) {
    x[row] = result[row];
  }
}
| 2418 |
// Floating-point 8x8 inverse DCT with adjustable output scaling.
// idct8_1d is applied to each row and then to each column; every value is
// divided by (8 >> scale), rounded, and stored in block as a dense 8x8
// array (row stride 8).
//   coefs - coefficients; row r starts at coefs[r * (pitch >> 1)]
//   block - receives 8 * 8 values
//   pitch - input row pitch; halved to index int16_t elements
//   scale - right-shift applied to the divisor 8
static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
                                  int scale) {
  double work[8 * 8], column[8];
  const int stride = pitch >> 1;
  int r, c, n;

  vp9_clear_system_state();  // Make it simd safe : __asm emms;

  // Load each row as doubles and transform it in place.
  for (r = 0; r < 8; r++) {
    double *row = work + 8 * r;
    for (c = 0; c < 8; c++) {
      row[c] = (double)coefs[r * stride + c];
    }
    idct8_1d(row);
  }

  // Transform each column through a contiguous scratch vector.
  for (c = 0; c < 8; c++) {
    for (r = 0; r < 8; ++r) {
      column[r] = work[c + 8 * r];
    }
    idct8_1d(column);
    for (r = 0; r < 8; ++r) {
      work[c + 8 * r] = column[r];
    }
  }

  // Final scaling and rounding to int16_t.
  for (n = 0; n < 8 * 8; n++) {
    block[n] = (int16_t)round(work[n] / (8 >> scale));
  }

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 2449 |
// Scales d by 2^n: shifts left by n when n >= 0, otherwise shifts right by
// -n. The right shift negates n because shifting by a negative count is
// undefined behavior in C. Both arguments may be evaluated more than once,
// so they must be free of side effects.
#define multiply_bits(d, n) ((n) < 0 ? (d) >> -(n) : (d) << (n))
| 2451 |
| 2452 #if DWTDCT_TYPE == DWTDCT16X16_LEAN |
| 2453 |
| 2454 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { |
| 2455 // assume output is a 32x32 buffer |
| 2456 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct |
| 2457 int16_t buffer[16 * 16]; |
| 2458 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt |
| 2459 int16_t buffer2[32 * 32]; |
| 2460 // Note: pitch is in bytes, short_pitch is in short units |
| 2461 const int short_pitch = pitch >> 1; |
| 2462 int i, j; |
| 2463 |
| 2464 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2465 // argument to the idct16x16 function |
| 2466 vp9_short_idct16x16_c_f(input, buffer, pitch, |
| 2467 1 + DWT_PRECISION_BITS); |
| 2468 for (i = 0; i < 16; ++i) { |
| 2469 vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); |
| 2470 } |
| 2471 for (i = 0; i < 16; ++i) { |
| 2472 for (j = 16; j < 32; ++j) { |
| 2473 buffer2[i * 32 + j] = |
| 2474 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); |
| 2475 } |
| 2476 } |
| 2477 for (i = 16; i < 32; ++i) { |
| 2478 for (j = 0; j < 32; ++j) { |
| 2479 buffer2[i * 32 + j] = |
| 2480 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); |
| 2481 } |
| 2482 } |
| 2483 #if DWT_TYPE == 26 |
| 2484 dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32); |
| 2485 #elif DWT_TYPE == 97 |
| 2486 dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32); |
| 2487 #elif DWT_TYPE == 53 |
| 2488 dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); |
| 2489 #endif |
| 2490 } |
| 2491 |
| 2492 #elif DWTDCT_TYPE == DWTDCT16X16 |
| 2493 |
| 2494 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { |
| 2495 // assume output is a 32x32 buffer |
| 2496 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct |
| 2497 int16_t buffer[16 * 16]; |
| 2498 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt |
| 2499 int16_t buffer2[32 * 32]; |
| 2500 // Note: pitch is in bytes, short_pitch is in short units |
| 2501 const int short_pitch = pitch >> 1; |
| 2502 int i, j; |
| 2503 |
| 2504 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2505 // argument to the idct16x16 function |
| 2506 vp9_short_idct16x16_c_f(input, buffer, pitch, |
| 2507 1 + DWT_PRECISION_BITS); |
| 2508 for (i = 0; i < 16; ++i) { |
| 2509 vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); |
| 2510 } |
| 2511 vp9_short_idct16x16_c_f(input + 16, buffer, pitch, |
| 2512 1 + DWT_PRECISION_BITS); |
| 2513 for (i = 0; i < 16; ++i) { |
| 2514 vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16); |
| 2515 } |
| 2516 vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, |
| 2517 1 + DWT_PRECISION_BITS); |
| 2518 for (i = 0; i < 16; ++i) { |
| 2519 vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16, |
| 2520 sizeof(*buffer2) * 16); |
| 2521 } |
| 2522 vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, |
| 2523 1 + DWT_PRECISION_BITS); |
| 2524 for (i = 0; i < 16; ++i) { |
| 2525 vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16, |
| 2526 sizeof(*buffer2) * 16); |
| 2527 } |
| 2528 #if DWT_TYPE == 26 |
| 2529 dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32); |
| 2530 #elif DWT_TYPE == 97 |
| 2531 dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32); |
| 2532 #elif DWT_TYPE == 53 |
| 2533 dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); |
| 2534 #endif |
| 2535 } |
| 2536 |
| 2537 #elif DWTDCT_TYPE == DWTDCT8X8 |
| 2538 |
| 2539 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { |
| 2540 // assume output is a 32x32 buffer |
| 2541 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct |
| 2542 int16_t buffer[8 * 8]; |
| 2543 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt |
| 2544 int16_t buffer2[32 * 32]; |
| 2545 // Note: pitch is in bytes, short_pitch is in short units |
| 2546 const int short_pitch = pitch >> 1; |
| 2547 int i, j; |
| 2548 |
| 2549 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2550 // argument to the idct16x16 function |
| 2551 vp9_short_idct8x8_c_f(input, buffer, pitch, |
| 2552 1 + DWT_PRECISION_BITS); |
| 2553 for (i = 0; i < 8; ++i) { |
| 2554 vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8); |
| 2555 } |
| 2556 vp9_short_idct8x8_c_f(input + 8, buffer, pitch, |
| 2557 1 + DWT_PRECISION_BITS); |
| 2558 for (i = 0; i < 8; ++i) { |
| 2559 vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8); |
| 2560 } |
| 2561 vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch, |
| 2562 1 + DWT_PRECISION_BITS); |
| 2563 for (i = 0; i < 8; ++i) { |
| 2564 vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8, |
| 2565 sizeof(*buffer2) * 8); |
| 2566 } |
| 2567 vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch, |
| 2568 1 + DWT_PRECISION_BITS); |
| 2569 for (i = 0; i < 8; ++i) { |
| 2570 vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8, |
| 2571 sizeof(*buffer2) * 8); |
| 2572 } |
| 2573 for (i = 0; i < 16; ++i) { |
| 2574 for (j = 16; j < 32; ++j) { |
| 2575 buffer2[i * 32 + j] = |
| 2576 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); |
| 2577 } |
| 2578 } |
| 2579 for (i = 16; i < 32; ++i) { |
| 2580 for (j = 0; j < 32; ++j) { |
| 2581 buffer2[i * 32 + j] = |
| 2582 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); |
| 2583 } |
| 2584 } |
| 2585 #if DWT_TYPE == 26 |
| 2586 dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32); |
| 2587 #elif DWT_TYPE == 97 |
| 2588 dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32); |
| 2589 #elif DWT_TYPE == 53 |
| 2590 dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32); |
| 2591 #endif |
| 2592 } |
| 2593 |
| 2594 #endif |
| 2595 |
| 2596 #if CONFIG_TX64X64 |
| 2597 void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) { |
| 2598 // assume output is a 64x64 buffer |
| 2599 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct |
| 2600 int16_t buffer[16 * 16]; |
| 2601 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt |
| 2602 int16_t buffer2[64 * 64]; |
| 2603 // Note: pitch is in bytes, short_pitch is in short units |
| 2604 const int short_pitch = pitch >> 1; |
| 2605 int i, j; |
| 2606 |
| 2607 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2608 // argument to the idct16x16 function |
| 2609 vp9_short_idct16x16_c_f(input, buffer, pitch, |
| 2610 2 + DWT_PRECISION_BITS); |
| 2611 for (i = 0; i < 16; ++i) { |
| 2612 vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16); |
| 2613 } |
| 2614 #if DWTDCT_TYPE == DWTDCT16X16_LEAN |
| 2615 for (i = 0; i < 16; ++i) { |
| 2616 for (j = 16; j < 64; ++j) { |
| 2617 buffer2[i * 64 + j] = |
| 2618 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); |
| 2619 } |
| 2620 } |
| 2621 for (i = 16; i < 64; ++i) { |
| 2622 for (j = 0; j < 64; ++j) { |
| 2623 buffer2[i * 64 + j] = |
| 2624 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); |
| 2625 } |
| 2626 } |
| 2627 #elif DWTDCT_TYPE == DWTDCT16X16 |
| 2628 vp9_short_idct16x16_c_f(input + 16, buffer, pitch, |
| 2629 2 + DWT_PRECISION_BITS); |
| 2630 for (i = 0; i < 16; ++i) { |
| 2631 vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16); |
| 2632 } |
| 2633 vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, |
| 2634 2 + DWT_PRECISION_BITS); |
| 2635 for (i = 0; i < 16; ++i) { |
| 2636 vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16, |
| 2637 sizeof(*buffer2) * 16); |
| 2638 } |
| 2639 vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, |
| 2640 2 + DWT_PRECISION_BITS); |
| 2641 for (i = 0; i < 16; ++i) { |
| 2642 vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16, |
| 2643 sizeof(*buffer2) * 16); |
| 2644 } |
| 2645 |
| 2646 // Copying and scaling highest bands into buffer2 |
| 2647 for (i = 0; i < 32; ++i) { |
| 2648 for (j = 32; j < 64; ++j) { |
| 2649 buffer2[i * 64 + j] = |
| 2650 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); |
| 2651 } |
| 2652 } |
| 2653 for (i = 32; i < 64; ++i) { |
| 2654 for (j = 0; j < 64; ++j) { |
| 2655 buffer2[i * 64 + j] = |
| 2656 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); |
| 2657 } |
| 2658 } |
| 2659 #endif // DWTDCT_TYPE |
| 2660 |
| 2661 #if DWT_TYPE == 26 |
| 2662 dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64); |
| 2663 #elif DWT_TYPE == 97 |
| 2664 dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64); |
| 2665 #elif DWT_TYPE == 53 |
| 2666 dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64); |
| 2667 #endif |
| 2668 } |
| 2669 #endif // CONFIG_TX64X64 |
| 2670 #endif // !CONFIG_DWTDCTHYBRID |
OLD | NEW |