| Index: source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
|
| diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
|
| similarity index 95%
|
| copy from source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
|
| copy to source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
|
| index e7ca39369b3962ce1584509b2ceb4476f92600bd..6d41e6190b78c032023aa851efc9c65c71c7c8f1 100644
|
| --- a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
|
| +++ b/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
|
| @@ -1,5 +1,5 @@
|
| /*
|
| - * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
|
| + * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
| *
|
| * Use of this source code is governed by a BSD-style license
|
| * that can be found in the LICENSE file in the root of the source
|
| @@ -8,21 +8,14 @@
|
| * be found in the AUTHORS file in the root of the source tree.
|
| */
|
|
|
| -#include <assert.h>
|
| -#include <stdio.h>
|
| -
|
| #include "./vpx_config.h"
|
| -#include "./vp9_rtcd.h"
|
| -#include "vp9/common/vp9_common.h"
|
| -#include "vp9/common/vp9_blockd.h"
|
| -#include "vp9/common/vp9_idct.h"
|
| -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
|
| +#include "./vpx_dsp_rtcd.h"
|
| +#include "vpx_dsp/mips/inv_txfm_dspr2.h"
|
| #include "vpx_dsp/txfm_common.h"
|
| -#include "vpx_ports/mem.h"
|
|
|
| #if HAVE_DSPR2
|
| -static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
|
| - uint32_t no_rows) {
|
| +void idct16_rows_dspr2(const int16_t *input, int16_t *output,
|
| + uint32_t no_rows) {
|
| int i;
|
| int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
|
| int step1_10, step1_11, step1_12, step1_13;
|
| @@ -406,8 +399,8 @@ static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
|
| }
|
| }
|
|
|
| -static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
|
| - int dest_stride) {
|
| +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
|
| + int dest_stride) {
|
| int i;
|
| int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
|
| int step1_8, step1_9, step1_10, step1_11;
|
| @@ -894,7 +887,7 @@ static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
|
| }
|
| }
|
|
|
| -void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
|
| +void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
|
| int dest_stride) {
|
| DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
|
| uint32_t pos = 45;
|
| @@ -913,7 +906,153 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
|
| idct16_cols_add_blk_dspr2(out, dest, dest_stride);
|
| }
|
|
|
| -static void iadst16(const int16_t *input, int16_t *output) {
|
| +void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
|
| + int dest_stride) {
|
| + DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
|
| + int16_t *outptr = out;
|
| + uint32_t i;
|
| + uint32_t pos = 45;
|
| +
|
| +  /* bit position for extract from acc */
|
| + __asm__ __volatile__ (
|
| + "wrdsp %[pos], 1 \n\t"
|
| + :
|
| + : [pos] "r" (pos)
|
| + );
|
| +
|
| + // First transform rows. Since all non-zero dct coefficients are in
|
| + // upper-left 4x4 area, we only need to calculate first 4 rows here.
|
| + idct16_rows_dspr2(input, outptr, 4);
|
| +
|
| + outptr += 4;
|
| + for (i = 0; i < 6; ++i) {
|
| + __asm__ __volatile__ (
|
| + "sw $zero, 0(%[outptr]) \n\t"
|
| + "sw $zero, 32(%[outptr]) \n\t"
|
| + "sw $zero, 64(%[outptr]) \n\t"
|
| + "sw $zero, 96(%[outptr]) \n\t"
|
| + "sw $zero, 128(%[outptr]) \n\t"
|
| + "sw $zero, 160(%[outptr]) \n\t"
|
| + "sw $zero, 192(%[outptr]) \n\t"
|
| + "sw $zero, 224(%[outptr]) \n\t"
|
| + "sw $zero, 256(%[outptr]) \n\t"
|
| + "sw $zero, 288(%[outptr]) \n\t"
|
| + "sw $zero, 320(%[outptr]) \n\t"
|
| + "sw $zero, 352(%[outptr]) \n\t"
|
| + "sw $zero, 384(%[outptr]) \n\t"
|
| + "sw $zero, 416(%[outptr]) \n\t"
|
| + "sw $zero, 448(%[outptr]) \n\t"
|
| + "sw $zero, 480(%[outptr]) \n\t"
|
| +
|
| + :
|
| + : [outptr] "r" (outptr)
|
| + );
|
| +
|
| + outptr += 2;
|
| + }
|
| +
|
| + // Then transform columns
|
| + idct16_cols_add_blk_dspr2(out, dest, dest_stride);
|
| +}
|
| +
|
| +void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
|
| + int dest_stride) {
|
| + uint32_t pos = 45;
|
| + int32_t out;
|
| + int32_t r;
|
| + int32_t a1, absa1;
|
| + int32_t vector_a1;
|
| + int32_t t1, t2, t3, t4;
|
| + int32_t vector_1, vector_2, vector_3, vector_4;
|
| +
|
| +  /* bit position for extract from acc */
|
| + __asm__ __volatile__ (
|
| + "wrdsp %[pos], 1 \n\t"
|
| +
|
| + :
|
| + : [pos] "r" (pos)
|
| + );
|
| +
|
| + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
|
| + __asm__ __volatile__ (
|
| + "addi %[out], %[out], 32 \n\t"
|
| + "sra %[a1], %[out], 6 \n\t"
|
| +
|
| + : [out] "+r" (out), [a1] "=r" (a1)
|
| + :
|
| + );
|
| +
|
| + if (a1 < 0) {
|
| + /* use quad-byte
|
| + * input and output memory are four byte aligned */
|
| + __asm__ __volatile__ (
|
| + "abs %[absa1], %[a1] \n\t"
|
| + "replv.qb %[vector_a1], %[absa1] \n\t"
|
| +
|
| + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
|
| + : [a1] "r" (a1)
|
| + );
|
| +
|
| + for (r = 16; r--;) {
|
| + __asm__ __volatile__ (
|
| + "lw %[t1], 0(%[dest]) \n\t"
|
| + "lw %[t2], 4(%[dest]) \n\t"
|
| + "lw %[t3], 8(%[dest]) \n\t"
|
| + "lw %[t4], 12(%[dest]) \n\t"
|
| + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
|
| + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
|
| + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
|
| + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
|
| + "sw %[vector_1], 0(%[dest]) \n\t"
|
| + "sw %[vector_2], 4(%[dest]) \n\t"
|
| + "sw %[vector_3], 8(%[dest]) \n\t"
|
| + "sw %[vector_4], 12(%[dest]) \n\t"
|
| + "add %[dest], %[dest], %[dest_stride] \n\t"
|
| +
|
| + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
|
| + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
|
| + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
|
| + [dest] "+&r" (dest)
|
| + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
|
| + );
|
| + }
|
| + } else {
|
| + /* use quad-byte
|
| + * input and output memory are four byte aligned */
|
| + __asm__ __volatile__ (
|
| + "replv.qb %[vector_a1], %[a1] \n\t"
|
| +
|
| + : [vector_a1] "=r" (vector_a1)
|
| + : [a1] "r" (a1)
|
| + );
|
| +
|
| + for (r = 16; r--;) {
|
| + __asm__ __volatile__ (
|
| + "lw %[t1], 0(%[dest]) \n\t"
|
| + "lw %[t2], 4(%[dest]) \n\t"
|
| + "lw %[t3], 8(%[dest]) \n\t"
|
| + "lw %[t4], 12(%[dest]) \n\t"
|
| + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
|
| + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
|
| + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
|
| + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
|
| + "sw %[vector_1], 0(%[dest]) \n\t"
|
| + "sw %[vector_2], 4(%[dest]) \n\t"
|
| + "sw %[vector_3], 8(%[dest]) \n\t"
|
| + "sw %[vector_4], 12(%[dest]) \n\t"
|
| + "add %[dest], %[dest], %[dest_stride] \n\t"
|
| +
|
| + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
|
| + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
|
| + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
|
| + [dest] "+&r" (dest)
|
| + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
|
| + );
|
| + }
|
| + }
|
| +}
|
| +
|
| +void iadst16_dspr2(const int16_t *input, int16_t *output) {
|
| int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
|
|
|
| int x0 = input[15];
|
| @@ -1084,234 +1223,5 @@ static void iadst16(const int16_t *input, int16_t *output) {
|
| output[15] = -x1;
|
| }
|
|
|
| -void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
|
| - int pitch, int tx_type) {
|
| - int i, j;
|
| - DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
|
| - int16_t *outptr = out;
|
| - int16_t temp_out[16];
|
| - uint32_t pos = 45;
|
| -
|
| - /* bit positon for extract from acc */
|
| - __asm__ __volatile__ (
|
| - "wrdsp %[pos], 1 \n\t"
|
| - :
|
| - : [pos] "r" (pos)
|
| - );
|
| -
|
| - switch (tx_type) {
|
| - case DCT_DCT: // DCT in both horizontal and vertical
|
| - idct16_rows_dspr2(input, outptr, 16);
|
| - idct16_cols_add_blk_dspr2(out, dest, pitch);
|
| - break;
|
| - case ADST_DCT: // ADST in vertical, DCT in horizontal
|
| - idct16_rows_dspr2(input, outptr, 16);
|
| -
|
| - outptr = out;
|
| -
|
| - for (i = 0; i < 16; ++i) {
|
| - iadst16(outptr, temp_out);
|
| -
|
| - for (j = 0; j < 16; ++j)
|
| - dest[j * pitch + i] =
|
| - clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
|
| - + dest[j * pitch + i]);
|
| - outptr += 16;
|
| - }
|
| - break;
|
| - case DCT_ADST: // DCT in vertical, ADST in horizontal
|
| - {
|
| - int16_t temp_in[16 * 16];
|
| -
|
| - for (i = 0; i < 16; ++i) {
|
| - /* prefetch row */
|
| - prefetch_load((const uint8_t *)(input + 16));
|
| -
|
| - iadst16(input, outptr);
|
| - input += 16;
|
| - outptr += 16;
|
| - }
|
| -
|
| - for (i = 0; i < 16; ++i)
|
| - for (j = 0; j < 16; ++j)
|
| - temp_in[j * 16 + i] = out[i * 16 + j];
|
| -
|
| - idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
|
| - }
|
| - break;
|
| - case ADST_ADST: // ADST in both directions
|
| - {
|
| - int16_t temp_in[16];
|
| -
|
| - for (i = 0; i < 16; ++i) {
|
| - /* prefetch row */
|
| - prefetch_load((const uint8_t *)(input + 16));
|
| -
|
| - iadst16(input, outptr);
|
| - input += 16;
|
| - outptr += 16;
|
| - }
|
| -
|
| - for (i = 0; i < 16; ++i) {
|
| - for (j = 0; j < 16; ++j)
|
| - temp_in[j] = out[j * 16 + i];
|
| - iadst16(temp_in, temp_out);
|
| - for (j = 0; j < 16; ++j)
|
| - dest[j * pitch + i] =
|
| - clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
|
| - + dest[j * pitch + i]);
|
| - }
|
| - }
|
| - break;
|
| - default:
|
| - printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");
|
| - break;
|
| - }
|
| -}
|
| -
|
| -void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
|
| - int dest_stride) {
|
| - DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
|
| - int16_t *outptr = out;
|
| - uint32_t i;
|
| - uint32_t pos = 45;
|
| -
|
| - /* bit positon for extract from acc */
|
| - __asm__ __volatile__ (
|
| - "wrdsp %[pos], 1 \n\t"
|
| - :
|
| - : [pos] "r" (pos)
|
| - );
|
| -
|
| - // First transform rows. Since all non-zero dct coefficients are in
|
| - // upper-left 4x4 area, we only need to calculate first 4 rows here.
|
| - idct16_rows_dspr2(input, outptr, 4);
|
| -
|
| - outptr += 4;
|
| - for (i = 0; i < 6; ++i) {
|
| - __asm__ __volatile__ (
|
| - "sw $zero, 0(%[outptr]) \n\t"
|
| - "sw $zero, 32(%[outptr]) \n\t"
|
| - "sw $zero, 64(%[outptr]) \n\t"
|
| - "sw $zero, 96(%[outptr]) \n\t"
|
| - "sw $zero, 128(%[outptr]) \n\t"
|
| - "sw $zero, 160(%[outptr]) \n\t"
|
| - "sw $zero, 192(%[outptr]) \n\t"
|
| - "sw $zero, 224(%[outptr]) \n\t"
|
| - "sw $zero, 256(%[outptr]) \n\t"
|
| - "sw $zero, 288(%[outptr]) \n\t"
|
| - "sw $zero, 320(%[outptr]) \n\t"
|
| - "sw $zero, 352(%[outptr]) \n\t"
|
| - "sw $zero, 384(%[outptr]) \n\t"
|
| - "sw $zero, 416(%[outptr]) \n\t"
|
| - "sw $zero, 448(%[outptr]) \n\t"
|
| - "sw $zero, 480(%[outptr]) \n\t"
|
| -
|
| - :
|
| - : [outptr] "r" (outptr)
|
| - );
|
| -
|
| - outptr += 2;
|
| - }
|
| -
|
| - // Then transform columns
|
| - idct16_cols_add_blk_dspr2(out, dest, dest_stride);
|
| -}
|
| -
|
| -void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
|
| - int dest_stride) {
|
| - uint32_t pos = 45;
|
| - int32_t out;
|
| - int32_t r;
|
| - int32_t a1, absa1;
|
| - int32_t vector_a1;
|
| - int32_t t1, t2, t3, t4;
|
| - int32_t vector_1, vector_2, vector_3, vector_4;
|
| -
|
| - /* bit positon for extract from acc */
|
| - __asm__ __volatile__ (
|
| - "wrdsp %[pos], 1 \n\t"
|
| -
|
| - :
|
| - : [pos] "r" (pos)
|
| - );
|
| -
|
| - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
|
| - __asm__ __volatile__ (
|
| - "addi %[out], %[out], 32 \n\t"
|
| - "sra %[a1], %[out], 6 \n\t"
|
| -
|
| - : [out] "+r" (out), [a1] "=r" (a1)
|
| - :
|
| - );
|
| -
|
| - if (a1 < 0) {
|
| - /* use quad-byte
|
| - * input and output memory are four byte aligned */
|
| - __asm__ __volatile__ (
|
| - "abs %[absa1], %[a1] \n\t"
|
| - "replv.qb %[vector_a1], %[absa1] \n\t"
|
| -
|
| - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
|
| - : [a1] "r" (a1)
|
| - );
|
| -
|
| - for (r = 16; r--;) {
|
| - __asm__ __volatile__ (
|
| - "lw %[t1], 0(%[dest]) \n\t"
|
| - "lw %[t2], 4(%[dest]) \n\t"
|
| - "lw %[t3], 8(%[dest]) \n\t"
|
| - "lw %[t4], 12(%[dest]) \n\t"
|
| - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
|
| - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
|
| - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
|
| - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
|
| - "sw %[vector_1], 0(%[dest]) \n\t"
|
| - "sw %[vector_2], 4(%[dest]) \n\t"
|
| - "sw %[vector_3], 8(%[dest]) \n\t"
|
| - "sw %[vector_4], 12(%[dest]) \n\t"
|
| - "add %[dest], %[dest], %[dest_stride] \n\t"
|
| -
|
| - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
|
| - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
|
| - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
|
| - [dest] "+&r" (dest)
|
| - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
|
| - );
|
| - }
|
| - } else {
|
| - /* use quad-byte
|
| - * input and output memory are four byte aligned */
|
| - __asm__ __volatile__ (
|
| - "replv.qb %[vector_a1], %[a1] \n\t"
|
| -
|
| - : [vector_a1] "=r" (vector_a1)
|
| - : [a1] "r" (a1)
|
| - );
|
| -
|
| - for (r = 16; r--;) {
|
| - __asm__ __volatile__ (
|
| - "lw %[t1], 0(%[dest]) \n\t"
|
| - "lw %[t2], 4(%[dest]) \n\t"
|
| - "lw %[t3], 8(%[dest]) \n\t"
|
| - "lw %[t4], 12(%[dest]) \n\t"
|
| - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
|
| - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
|
| - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
|
| - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
|
| - "sw %[vector_1], 0(%[dest]) \n\t"
|
| - "sw %[vector_2], 4(%[dest]) \n\t"
|
| - "sw %[vector_3], 8(%[dest]) \n\t"
|
| - "sw %[vector_4], 12(%[dest]) \n\t"
|
| - "add %[dest], %[dest], %[dest_stride] \n\t"
|
|
|
| - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
|
| - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
|
| - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
|
| - [dest] "+&r" (dest)
|
| - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
|
| - );
|
| - }
|
| - }
|
| -}
|
| -#endif // #if HAVE_DSPR2
|
| +#endif // HAVE_DSPR2
|
|
|