Index: source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
similarity index 95% |
copy from source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c |
copy to source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
index e7ca39369b3962ce1584509b2ceb4476f92600bd..6d41e6190b78c032023aa851efc9c65c71c7c8f1 100644 |
--- a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c |
+++ b/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
@@ -1,5 +1,5 @@ |
/* |
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
* |
* Use of this source code is governed by a BSD-style license |
* that can be found in the LICENSE file in the root of the source |
@@ -8,21 +8,14 @@ |
* be found in the AUTHORS file in the root of the source tree. |
*/ |
-#include <assert.h> |
-#include <stdio.h> |
- |
#include "./vpx_config.h" |
-#include "./vp9_rtcd.h" |
-#include "vp9/common/vp9_common.h" |
-#include "vp9/common/vp9_blockd.h" |
-#include "vp9/common/vp9_idct.h" |
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" |
+#include "./vpx_dsp_rtcd.h" |
+#include "vpx_dsp/mips/inv_txfm_dspr2.h" |
#include "vpx_dsp/txfm_common.h" |
-#include "vpx_ports/mem.h" |
#if HAVE_DSPR2 |
-static void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
- uint32_t no_rows) { |
+void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
+ uint32_t no_rows) { |
int i; |
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
int step1_10, step1_11, step1_12, step1_13; |
@@ -406,8 +399,8 @@ static void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
} |
} |
-static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
- int dest_stride) { |
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
+ int dest_stride) { |
int i; |
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
int step1_8, step1_9, step1_10, step1_11; |
@@ -894,7 +887,7 @@ static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
} |
} |
-void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
+void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
int dest_stride) { |
DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
uint32_t pos = 45; |
@@ -913,7 +906,153 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
} |
-static void iadst16(const int16_t *input, int16_t *output) { |
+void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, |
+ int dest_stride) { |
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
+ int16_t *outptr = out; |
+ uint32_t i; |
+ uint32_t pos = 45; |
+ |
+  /* bit position for extract from acc */ |
+ __asm__ __volatile__ ( |
+ "wrdsp %[pos], 1 \n\t" |
+ : |
+ : [pos] "r" (pos) |
+ ); |
+ |
+ // First transform rows. Since all non-zero dct coefficients are in |
+ // upper-left 4x4 area, we only need to calculate first 4 rows here. |
+ idct16_rows_dspr2(input, outptr, 4); |
+ |
+ outptr += 4; |
+ for (i = 0; i < 6; ++i) { |
+ __asm__ __volatile__ ( |
+ "sw $zero, 0(%[outptr]) \n\t" |
+ "sw $zero, 32(%[outptr]) \n\t" |
+ "sw $zero, 64(%[outptr]) \n\t" |
+ "sw $zero, 96(%[outptr]) \n\t" |
+ "sw $zero, 128(%[outptr]) \n\t" |
+ "sw $zero, 160(%[outptr]) \n\t" |
+ "sw $zero, 192(%[outptr]) \n\t" |
+ "sw $zero, 224(%[outptr]) \n\t" |
+ "sw $zero, 256(%[outptr]) \n\t" |
+ "sw $zero, 288(%[outptr]) \n\t" |
+ "sw $zero, 320(%[outptr]) \n\t" |
+ "sw $zero, 352(%[outptr]) \n\t" |
+ "sw $zero, 384(%[outptr]) \n\t" |
+ "sw $zero, 416(%[outptr]) \n\t" |
+ "sw $zero, 448(%[outptr]) \n\t" |
+ "sw $zero, 480(%[outptr]) \n\t" |
+ |
+ : |
+ : [outptr] "r" (outptr) |
+ ); |
+ |
+ outptr += 2; |
+ } |
+ |
+ // Then transform columns |
+ idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
+} |
+ |
+void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, |
+ int dest_stride) { |
+ uint32_t pos = 45; |
+ int32_t out; |
+ int32_t r; |
+ int32_t a1, absa1; |
+ int32_t vector_a1; |
+ int32_t t1, t2, t3, t4; |
+ int32_t vector_1, vector_2, vector_3, vector_4; |
+ |
+  /* bit position for extract from acc */ |
+ __asm__ __volatile__ ( |
+ "wrdsp %[pos], 1 \n\t" |
+ |
+ : |
+ : [pos] "r" (pos) |
+ ); |
+ |
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); |
+ __asm__ __volatile__ ( |
+ "addi %[out], %[out], 32 \n\t" |
+ "sra %[a1], %[out], 6 \n\t" |
+ |
+ : [out] "+r" (out), [a1] "=r" (a1) |
+ : |
+ ); |
+ |
+ if (a1 < 0) { |
+ /* use quad-byte |
+ * input and output memory are four byte aligned */ |
+ __asm__ __volatile__ ( |
+ "abs %[absa1], %[a1] \n\t" |
+ "replv.qb %[vector_a1], %[absa1] \n\t" |
+ |
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) |
+ : [a1] "r" (a1) |
+ ); |
+ |
+ for (r = 16; r--;) { |
+ __asm__ __volatile__ ( |
+ "lw %[t1], 0(%[dest]) \n\t" |
+ "lw %[t2], 4(%[dest]) \n\t" |
+ "lw %[t3], 8(%[dest]) \n\t" |
+ "lw %[t4], 12(%[dest]) \n\t" |
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
+ "sw %[vector_1], 0(%[dest]) \n\t" |
+ "sw %[vector_2], 4(%[dest]) \n\t" |
+ "sw %[vector_3], 8(%[dest]) \n\t" |
+ "sw %[vector_4], 12(%[dest]) \n\t" |
+ "add %[dest], %[dest], %[dest_stride] \n\t" |
+ |
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
+ [dest] "+&r" (dest) |
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
+ ); |
+ } |
+ } else { |
+ /* use quad-byte |
+ * input and output memory are four byte aligned */ |
+ __asm__ __volatile__ ( |
+ "replv.qb %[vector_a1], %[a1] \n\t" |
+ |
+ : [vector_a1] "=r" (vector_a1) |
+ : [a1] "r" (a1) |
+ ); |
+ |
+ for (r = 16; r--;) { |
+ __asm__ __volatile__ ( |
+ "lw %[t1], 0(%[dest]) \n\t" |
+ "lw %[t2], 4(%[dest]) \n\t" |
+ "lw %[t3], 8(%[dest]) \n\t" |
+ "lw %[t4], 12(%[dest]) \n\t" |
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
+ "sw %[vector_1], 0(%[dest]) \n\t" |
+ "sw %[vector_2], 4(%[dest]) \n\t" |
+ "sw %[vector_3], 8(%[dest]) \n\t" |
+ "sw %[vector_4], 12(%[dest]) \n\t" |
+ "add %[dest], %[dest], %[dest_stride] \n\t" |
+ |
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
+ [dest] "+&r" (dest) |
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
+ ); |
+ } |
+ } |
+} |
+ |
+void iadst16_dspr2(const int16_t *input, int16_t *output) { |
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
int x0 = input[15]; |
@@ -1084,234 +1223,5 @@ static void iadst16(const int16_t *input, int16_t *output) { |
output[15] = -x1; |
} |
-void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
- int pitch, int tx_type) { |
- int i, j; |
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
- int16_t *outptr = out; |
- int16_t temp_out[16]; |
- uint32_t pos = 45; |
- |
- /* bit positon for extract from acc */ |
- __asm__ __volatile__ ( |
- "wrdsp %[pos], 1 \n\t" |
- : |
- : [pos] "r" (pos) |
- ); |
- |
- switch (tx_type) { |
- case DCT_DCT: // DCT in both horizontal and vertical |
- idct16_rows_dspr2(input, outptr, 16); |
- idct16_cols_add_blk_dspr2(out, dest, pitch); |
- break; |
- case ADST_DCT: // ADST in vertical, DCT in horizontal |
- idct16_rows_dspr2(input, outptr, 16); |
- |
- outptr = out; |
- |
- for (i = 0; i < 16; ++i) { |
- iadst16(outptr, temp_out); |
- |
- for (j = 0; j < 16; ++j) |
- dest[j * pitch + i] = |
- clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
- + dest[j * pitch + i]); |
- outptr += 16; |
- } |
- break; |
- case DCT_ADST: // DCT in vertical, ADST in horizontal |
- { |
- int16_t temp_in[16 * 16]; |
- |
- for (i = 0; i < 16; ++i) { |
- /* prefetch row */ |
- prefetch_load((const uint8_t *)(input + 16)); |
- |
- iadst16(input, outptr); |
- input += 16; |
- outptr += 16; |
- } |
- |
- for (i = 0; i < 16; ++i) |
- for (j = 0; j < 16; ++j) |
- temp_in[j * 16 + i] = out[i * 16 + j]; |
- |
- idct16_cols_add_blk_dspr2(temp_in, dest, pitch); |
- } |
- break; |
- case ADST_ADST: // ADST in both directions |
- { |
- int16_t temp_in[16]; |
- |
- for (i = 0; i < 16; ++i) { |
- /* prefetch row */ |
- prefetch_load((const uint8_t *)(input + 16)); |
- |
- iadst16(input, outptr); |
- input += 16; |
- outptr += 16; |
- } |
- |
- for (i = 0; i < 16; ++i) { |
- for (j = 0; j < 16; ++j) |
- temp_in[j] = out[j * 16 + i]; |
- iadst16(temp_in, temp_out); |
- for (j = 0; j < 16; ++j) |
- dest[j * pitch + i] = |
- clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
- + dest[j * pitch + i]); |
- } |
- } |
- break; |
- default: |
- printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); |
- break; |
- } |
-} |
- |
-void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, |
- int dest_stride) { |
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
- int16_t *outptr = out; |
- uint32_t i; |
- uint32_t pos = 45; |
- |
- /* bit positon for extract from acc */ |
- __asm__ __volatile__ ( |
- "wrdsp %[pos], 1 \n\t" |
- : |
- : [pos] "r" (pos) |
- ); |
- |
- // First transform rows. Since all non-zero dct coefficients are in |
- // upper-left 4x4 area, we only need to calculate first 4 rows here. |
- idct16_rows_dspr2(input, outptr, 4); |
- |
- outptr += 4; |
- for (i = 0; i < 6; ++i) { |
- __asm__ __volatile__ ( |
- "sw $zero, 0(%[outptr]) \n\t" |
- "sw $zero, 32(%[outptr]) \n\t" |
- "sw $zero, 64(%[outptr]) \n\t" |
- "sw $zero, 96(%[outptr]) \n\t" |
- "sw $zero, 128(%[outptr]) \n\t" |
- "sw $zero, 160(%[outptr]) \n\t" |
- "sw $zero, 192(%[outptr]) \n\t" |
- "sw $zero, 224(%[outptr]) \n\t" |
- "sw $zero, 256(%[outptr]) \n\t" |
- "sw $zero, 288(%[outptr]) \n\t" |
- "sw $zero, 320(%[outptr]) \n\t" |
- "sw $zero, 352(%[outptr]) \n\t" |
- "sw $zero, 384(%[outptr]) \n\t" |
- "sw $zero, 416(%[outptr]) \n\t" |
- "sw $zero, 448(%[outptr]) \n\t" |
- "sw $zero, 480(%[outptr]) \n\t" |
- |
- : |
- : [outptr] "r" (outptr) |
- ); |
- |
- outptr += 2; |
- } |
- |
- // Then transform columns |
- idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
-} |
- |
-void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, |
- int dest_stride) { |
- uint32_t pos = 45; |
- int32_t out; |
- int32_t r; |
- int32_t a1, absa1; |
- int32_t vector_a1; |
- int32_t t1, t2, t3, t4; |
- int32_t vector_1, vector_2, vector_3, vector_4; |
- |
- /* bit positon for extract from acc */ |
- __asm__ __volatile__ ( |
- "wrdsp %[pos], 1 \n\t" |
- |
- : |
- : [pos] "r" (pos) |
- ); |
- |
- out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); |
- __asm__ __volatile__ ( |
- "addi %[out], %[out], 32 \n\t" |
- "sra %[a1], %[out], 6 \n\t" |
- |
- : [out] "+r" (out), [a1] "=r" (a1) |
- : |
- ); |
- |
- if (a1 < 0) { |
- /* use quad-byte |
- * input and output memory are four byte aligned */ |
- __asm__ __volatile__ ( |
- "abs %[absa1], %[a1] \n\t" |
- "replv.qb %[vector_a1], %[absa1] \n\t" |
- |
- : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) |
- : [a1] "r" (a1) |
- ); |
- |
- for (r = 16; r--;) { |
- __asm__ __volatile__ ( |
- "lw %[t1], 0(%[dest]) \n\t" |
- "lw %[t2], 4(%[dest]) \n\t" |
- "lw %[t3], 8(%[dest]) \n\t" |
- "lw %[t4], 12(%[dest]) \n\t" |
- "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
- "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
- "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
- "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
- "sw %[vector_1], 0(%[dest]) \n\t" |
- "sw %[vector_2], 4(%[dest]) \n\t" |
- "sw %[vector_3], 8(%[dest]) \n\t" |
- "sw %[vector_4], 12(%[dest]) \n\t" |
- "add %[dest], %[dest], %[dest_stride] \n\t" |
- |
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
- [dest] "+&r" (dest) |
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
- ); |
- } |
- } else { |
- /* use quad-byte |
- * input and output memory are four byte aligned */ |
- __asm__ __volatile__ ( |
- "replv.qb %[vector_a1], %[a1] \n\t" |
- |
- : [vector_a1] "=r" (vector_a1) |
- : [a1] "r" (a1) |
- ); |
- |
- for (r = 16; r--;) { |
- __asm__ __volatile__ ( |
- "lw %[t1], 0(%[dest]) \n\t" |
- "lw %[t2], 4(%[dest]) \n\t" |
- "lw %[t3], 8(%[dest]) \n\t" |
- "lw %[t4], 12(%[dest]) \n\t" |
- "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
- "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
- "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
- "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
- "sw %[vector_1], 0(%[dest]) \n\t" |
- "sw %[vector_2], 4(%[dest]) \n\t" |
- "sw %[vector_3], 8(%[dest]) \n\t" |
- "sw %[vector_4], 12(%[dest]) \n\t" |
- "add %[dest], %[dest], %[dest_stride] \n\t" |
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
- [dest] "+&r" (dest) |
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
- ); |
- } |
- } |
-} |
-#endif // #if HAVE_DSPR2 |
+#endif // HAVE_DSPR2 |