| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | |
| 12 #include <stdio.h> | |
| 13 | |
| 14 #include "./vpx_config.h" | 11 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" | 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
| 17 #include "vp9/common/vp9_blockd.h" | |
| 18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
| 19 #include "vpx_dsp/txfm_common.h" | 14 #include "vpx_dsp/txfm_common.h" |
| 20 #include "vpx_ports/mem.h" | |
| 21 | 15 |
| 22 #if HAVE_DSPR2 | 16 #if HAVE_DSPR2 |
| 23 static void idct8_rows_dspr2(const int16_t *input, int16_t *output, | 17 void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { |
| 24 uint32_t no_rows) { | |
| 25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 18 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
| 26 const int const_2_power_13 = 8192; | 19 const int const_2_power_13 = 8192; |
| 27 int Temp0, Temp1, Temp2, Temp3, Temp4; | 20 int Temp0, Temp1, Temp2, Temp3, Temp4; |
| 28 int i; | 21 int i; |
| 29 | 22 |
| 30 for (i = no_rows; i--; ) { | 23 for (i = no_rows; i--; ) { |
| 31 __asm__ __volatile__ ( | 24 __asm__ __volatile__ ( |
| 32 /* | 25 /* |
| 33 temp_1 = (input[0] + input[4]) * cospi_16_64; | 26 temp_1 = (input[0] + input[4]) * cospi_16_64; |
| 34 step2_0 = dct_const_round_shift(temp_1); | 27 step2_0 = dct_const_round_shift(temp_1); |
| (...skipping 159 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 194 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), | 187 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), |
| 195 [cospi_24_64] "r" (cospi_24_64), | 188 [cospi_24_64] "r" (cospi_24_64), |
| 196 [output] "r" (output), [input] "r" (input) | 189 [output] "r" (output), [input] "r" (input) |
| 197 ); | 190 ); |
| 198 | 191 |
| 199 input += 8; | 192 input += 8; |
| 200 output += 1; | 193 output += 1; |
| 201 } | 194 } |
| 202 } | 195 } |
| 203 | 196 |
| 204 static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, | 197 void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, |
| 205 int dest_stride) { | 198 int dest_stride) { |
| 206 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 199 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
| 207 int Temp0, Temp1, Temp2, Temp3; | 200 int Temp0, Temp1, Temp2, Temp3; |
| 208 int i; | 201 int i; |
| 209 const int const_2_power_13 = 8192; | 202 const int const_2_power_13 = 8192; |
| 210 uint8_t *dest_pix; | 203 uint8_t *dest_pix; |
| 211 uint8_t *cm = vpx_ff_cropTbl; | 204 uint8_t *cm = vpx_ff_cropTbl; |
| 212 | 205 |
| 213 /* prefetch vpx_ff_cropTbl */ | 206 /* prefetch vpx_ff_cropTbl */ |
| 214 prefetch_load(vpx_ff_cropTbl); | 207 prefetch_load(vpx_ff_cropTbl); |
| 215 prefetch_load(vpx_ff_cropTbl + 32); | 208 prefetch_load(vpx_ff_cropTbl + 32); |
| (...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 442 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), | 435 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), |
| 443 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), | 436 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), |
| 444 [cospi_24_64] "r" (cospi_24_64), | 437 [cospi_24_64] "r" (cospi_24_64), |
| 445 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) | 438 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) |
| 446 ); | 439 ); |
| 447 | 440 |
| 448 input += 8; | 441 input += 8; |
| 449 } | 442 } |
| 450 } | 443 } |
| 451 | 444 |
| 452 void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, | 445 void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, |
| 453 int dest_stride) { | 446 int dest_stride) { |
| 454 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | 447 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); |
| 455 int16_t *outptr = out; | 448 int16_t *outptr = out; |
| 456 uint32_t pos = 45; | 449 uint32_t pos = 45; |
| 457 | 450 |
| 458 /* bit position for extraction from acc */ | 451 /* bit position for extraction from acc */ |
| 459 __asm__ __volatile__ ( | 452 __asm__ __volatile__ ( |
| 460 "wrdsp %[pos], 1 \n\t" | 453 "wrdsp %[pos], 1 \n\t" |
| 461 : | 454 : |
| 462 : [pos] "r" (pos) | 455 : [pos] "r" (pos) |
| 463 ); | 456 ); |
| 464 | 457 |
| 465 // First transform rows | 458 // First transform rows |
| 466 idct8_rows_dspr2(input, outptr, 8); | 459 idct8_rows_dspr2(input, outptr, 8); |
| 467 | 460 |
| 468 // Then transform columns and add to dest | 461 // Then transform columns and add to dest |
| 469 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | 462 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); |
| 470 } | 463 } |
| 471 | 464 |
| 472 static void iadst8_dspr2(const int16_t *input, int16_t *output) { | 465 void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, |
| 473 int s0, s1, s2, s3, s4, s5, s6, s7; | |
| 474 int x0, x1, x2, x3, x4, x5, x6, x7; | |
| 475 | |
| 476 x0 = input[7]; | |
| 477 x1 = input[0]; | |
| 478 x2 = input[5]; | |
| 479 x3 = input[2]; | |
| 480 x4 = input[3]; | |
| 481 x5 = input[4]; | |
| 482 x6 = input[1]; | |
| 483 x7 = input[6]; | |
| 484 | |
| 485 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { | |
| 486 output[0] = output[1] = output[2] = output[3] = output[4] | |
| 487 = output[5] = output[6] = output[7] = 0; | |
| 488 return; | |
| 489 } | |
| 490 | |
| 491 // stage 1 | |
| 492 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | |
| 493 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | |
| 494 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | |
| 495 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | |
| 496 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | |
| 497 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | |
| 498 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; | |
| 499 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; | |
| 500 | |
| 501 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); | |
| 502 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); | |
| 503 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); | |
| 504 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); | |
| 505 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); | |
| 506 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); | |
| 507 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); | |
| 508 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); | |
| 509 | |
| 510 // stage 2 | |
| 511 s0 = x0; | |
| 512 s1 = x1; | |
| 513 s2 = x2; | |
| 514 s3 = x3; | |
| 515 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; | |
| 516 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; | |
| 517 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; | |
| 518 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; | |
| 519 | |
| 520 x0 = s0 + s2; | |
| 521 x1 = s1 + s3; | |
| 522 x2 = s0 - s2; | |
| 523 x3 = s1 - s3; | |
| 524 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); | |
| 525 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); | |
| 526 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); | |
| 527 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); | |
| 528 | |
| 529 // stage 3 | |
| 530 s2 = cospi_16_64 * (x2 + x3); | |
| 531 s3 = cospi_16_64 * (x2 - x3); | |
| 532 s6 = cospi_16_64 * (x6 + x7); | |
| 533 s7 = cospi_16_64 * (x6 - x7); | |
| 534 | |
| 535 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); | |
| 536 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); | |
| 537 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); | |
| 538 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); | |
| 539 | |
| 540 output[0] = x0; | |
| 541 output[1] = -x4; | |
| 542 output[2] = x6; | |
| 543 output[3] = -x2; | |
| 544 output[4] = x3; | |
| 545 output[5] = -x7; | |
| 546 output[6] = x5; | |
| 547 output[7] = -x1; | |
| 548 } | |
| 549 | |
| 550 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 551 int dest_stride, int tx_type) { | |
| 552 int i, j; | |
| 553 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | |
| 554 int16_t *outptr = out; | |
| 555 int16_t temp_in[8 * 8], temp_out[8]; | |
| 556 uint32_t pos = 45; | |
| 557 | |
| 558 /* bit position for extraction from acc */ | |
| 559 __asm__ __volatile__ ( | |
| 560 "wrdsp %[pos], 1 \n\t" | |
| 561 : | |
| 562 : [pos] "r" (pos) | |
| 563 ); | |
| 564 | |
| 565 switch (tx_type) { | |
| 566 case DCT_DCT: // DCT in both horizontal and vertical | |
| 567 idct8_rows_dspr2(input, outptr, 8); | |
| 568 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | |
| 569 break; | |
| 570 case ADST_DCT: // ADST in vertical, DCT in horizontal | |
| 571 idct8_rows_dspr2(input, outptr, 8); | |
| 572 | |
| 573 for (i = 0; i < 8; ++i) { | |
| 574 iadst8_dspr2(&out[i * 8], temp_out); | |
| 575 | |
| 576 for (j = 0; j < 8; ++j) | |
| 577 dest[j * dest_stride + i] = | |
| 578 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | |
| 579 + dest[j * dest_stride + i]); | |
| 580 } | |
| 581 break; | |
| 582 case DCT_ADST: // DCT in vertical, ADST in horizontal | |
| 583 for (i = 0; i < 8; ++i) { | |
| 584 iadst8_dspr2(input, outptr); | |
| 585 input += 8; | |
| 586 outptr += 8; | |
| 587 } | |
| 588 | |
| 589 for (i = 0; i < 8; ++i) { | |
| 590 for (j = 0; j < 8; ++j) { | |
| 591 temp_in[i * 8 + j] = out[j * 8 + i]; | |
| 592 } | |
| 593 } | |
| 594 idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); | |
| 595 break; | |
| 596 case ADST_ADST: // ADST in both directions | |
| 597 for (i = 0; i < 8; ++i) { | |
| 598 iadst8_dspr2(input, outptr); | |
| 599 input += 8; | |
| 600 outptr += 8; | |
| 601 } | |
| 602 | |
| 603 for (i = 0; i < 8; ++i) { | |
| 604 for (j = 0; j < 8; ++j) | |
| 605 temp_in[j] = out[j * 8 + i]; | |
| 606 | |
| 607 iadst8_dspr2(temp_in, temp_out); | |
| 608 | |
| 609 for (j = 0; j < 8; ++j) | |
| 610 dest[j * dest_stride + i] = | |
| 611 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | |
| 612 + dest[j * dest_stride + i]); | |
| 613 } | |
| 614 break; | |
| 615 default: | |
| 616 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); | |
| 617 break; | |
| 618 } | |
| 619 } | |
| 620 | |
| 621 void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 622 int dest_stride) { | 466 int dest_stride) { |
| 623 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | 467 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); |
| 624 int16_t *outptr = out; | 468 int16_t *outptr = out; |
| 625 uint32_t pos = 45; | 469 uint32_t pos = 45; |
| 626 | 470 |
| 627 /* bit position for extraction from acc */ | 471 /* bit position for extraction from acc */ |
| 628 __asm__ __volatile__ ( | 472 __asm__ __volatile__ ( |
| 629 "wrdsp %[pos], 1 \n\t" | 473 "wrdsp %[pos], 1 \n\t" |
| 630 : | 474 : |
| 631 : [pos] "r" (pos) | 475 : [pos] "r" (pos) |
| (...skipping 24 matching lines...) Expand all Loading... |
| 656 | 500 |
| 657 : | 501 : |
| 658 : [outptr] "r" (outptr) | 502 : [outptr] "r" (outptr) |
| 659 ); | 503 ); |
| 660 | 504 |
| 661 | 505 |
| 662 // Then transform columns and add to dest | 506 // Then transform columns and add to dest |
| 663 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | 507 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); |
| 664 } | 508 } |
| 665 | 509 |
| 666 void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, | 510 void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, |
| 667 int dest_stride) { | 511 int dest_stride) { |
| 668 uint32_t pos = 45; | 512 uint32_t pos = 45; |
| 669 int32_t out; | 513 int32_t out; |
| 670 int32_t r; | 514 int32_t r; |
| 671 int32_t a1, absa1; | 515 int32_t a1, absa1; |
| 672 int32_t t1, t2, vector_a1, vector_1, vector_2; | 516 int32_t t1, t2, vector_a1, vector_1, vector_2; |
| 673 | 517 |
| 674 /* bit position for extraction from acc */ | 518 /* bit position for extraction from acc */ |
| 675 __asm__ __volatile__ ( | 519 __asm__ __volatile__ ( |
| 676 "wrdsp %[pos], 1 \n\t" | 520 "wrdsp %[pos], 1 \n\t" |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 736 "add %[dest], %[dest], %[dest_stride] \n\t" | 580 "add %[dest], %[dest], %[dest_stride] \n\t" |
| 737 | 581 |
| 738 : [t1] "=&r" (t1), [t2] "=&r" (t2), | 582 : [t1] "=&r" (t1), [t2] "=&r" (t2), |
| 739 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | 583 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
| 740 [dest] "+r" (dest) | 584 [dest] "+r" (dest) |
| 741 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | 585 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
| 742 ); | 586 ); |
| 743 } | 587 } |
| 744 } | 588 } |
| 745 } | 589 } |
| 746 #endif // #if HAVE_DSPR2 | 590 |
| 591 void iadst8_dspr2(const int16_t *input, int16_t *output) { |
| 592 int s0, s1, s2, s3, s4, s5, s6, s7; |
| 593 int x0, x1, x2, x3, x4, x5, x6, x7; |
| 594 |
| 595 x0 = input[7]; |
| 596 x1 = input[0]; |
| 597 x2 = input[5]; |
| 598 x3 = input[2]; |
| 599 x4 = input[3]; |
| 600 x5 = input[4]; |
| 601 x6 = input[1]; |
| 602 x7 = input[6]; |
| 603 |
| 604 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { |
| 605 output[0] = output[1] = output[2] = output[3] = output[4] |
| 606 = output[5] = output[6] = output[7] = 0; |
| 607 return; |
| 608 } |
| 609 |
| 610 // stage 1 |
| 611 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; |
| 612 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; |
| 613 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; |
| 614 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; |
| 615 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; |
| 616 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; |
| 617 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; |
| 618 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; |
| 619 |
| 620 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); |
| 621 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); |
| 622 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); |
| 623 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); |
| 624 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); |
| 625 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); |
| 626 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); |
| 627 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); |
| 628 |
| 629 // stage 2 |
| 630 s0 = x0; |
| 631 s1 = x1; |
| 632 s2 = x2; |
| 633 s3 = x3; |
| 634 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; |
| 635 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; |
| 636 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; |
| 637 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; |
| 638 |
| 639 x0 = s0 + s2; |
| 640 x1 = s1 + s3; |
| 641 x2 = s0 - s2; |
| 642 x3 = s1 - s3; |
| 643 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); |
| 644 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); |
| 645 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); |
| 646 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); |
| 647 |
| 648 // stage 3 |
| 649 s2 = cospi_16_64 * (x2 + x3); |
| 650 s3 = cospi_16_64 * (x2 - x3); |
| 651 s6 = cospi_16_64 * (x6 + x7); |
| 652 s7 = cospi_16_64 * (x6 - x7); |
| 653 |
| 654 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); |
| 655 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); |
| 656 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); |
| 657 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); |
| 658 |
| 659 output[0] = x0; |
| 660 output[1] = -x4; |
| 661 output[2] = x6; |
| 662 output[3] = -x2; |
| 663 output[4] = x3; |
| 664 output[5] = -x7; |
| 665 output[6] = x5; |
| 666 output[7] = -x1; |
| 667 } |
| 668 #endif // HAVE_DSPR2 |
| OLD | NEW |