Index: source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
similarity index 95% |
copy from source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c |
copy to source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
index e7ca39369b3962ce1584509b2ceb4476f92600bd..6d41e6190b78c032023aa851efc9c65c71c7c8f1 100644 |
--- a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c |
+++ b/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c |
@@ -1,5 +1,5 @@ |
/* |
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
* |
* Use of this source code is governed by a BSD-style license |
* that can be found in the LICENSE file in the root of the source |
@@ -8,21 +8,14 @@ |
* be found in the AUTHORS file in the root of the source tree. |
*/ |
-#include <assert.h> |
-#include <stdio.h> |
- |
#include "./vpx_config.h" |
-#include "./vp9_rtcd.h" |
-#include "vp9/common/vp9_common.h" |
-#include "vp9/common/vp9_blockd.h" |
-#include "vp9/common/vp9_idct.h" |
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" |
+#include "./vpx_dsp_rtcd.h" |
+#include "vpx_dsp/mips/inv_txfm_dspr2.h" |
#include "vpx_dsp/txfm_common.h" |
-#include "vpx_ports/mem.h" |
#if HAVE_DSPR2 |
-static void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
- uint32_t no_rows) { |
+void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
+ uint32_t no_rows) { |
int i; |
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
int step1_10, step1_11, step1_12, step1_13; |
@@ -406,8 +399,8 @@ static void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
} |
} |
-static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
- int dest_stride) { |
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
+ int dest_stride) { |
int i; |
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
int step1_8, step1_9, step1_10, step1_11; |
@@ -894,7 +887,7 @@ static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
} |
} |
-void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
+void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
int dest_stride) { |
DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
uint32_t pos = 45; |
@@ -913,7 +906,153 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
} |
-static void iadst16(const int16_t *input, int16_t *output) { |
+void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, |
+ int dest_stride) { |
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
+ int16_t *outptr = out; |
+ uint32_t i; |
+ uint32_t pos = 45; |
+ |
+  /* bit position for extract from acc */ |
+ __asm__ __volatile__ ( |
+ "wrdsp %[pos], 1 \n\t" |
+ : |
+ : [pos] "r" (pos) |
+ ); |
+ |
+ // First transform rows. Since all non-zero dct coefficients are in |
+ // upper-left 4x4 area, we only need to calculate first 4 rows here. |
+ idct16_rows_dspr2(input, outptr, 4); |
+ |
+ outptr += 4; |
+ for (i = 0; i < 6; ++i) { |
+ __asm__ __volatile__ ( |
+ "sw $zero, 0(%[outptr]) \n\t" |
+ "sw $zero, 32(%[outptr]) \n\t" |
+ "sw $zero, 64(%[outptr]) \n\t" |
+ "sw $zero, 96(%[outptr]) \n\t" |
+ "sw $zero, 128(%[outptr]) \n\t" |
+ "sw $zero, 160(%[outptr]) \n\t" |
+ "sw $zero, 192(%[outptr]) \n\t" |
+ "sw $zero, 224(%[outptr]) \n\t" |
+ "sw $zero, 256(%[outptr]) \n\t" |
+ "sw $zero, 288(%[outptr]) \n\t" |
+ "sw $zero, 320(%[outptr]) \n\t" |
+ "sw $zero, 352(%[outptr]) \n\t" |
+ "sw $zero, 384(%[outptr]) \n\t" |
+ "sw $zero, 416(%[outptr]) \n\t" |
+ "sw $zero, 448(%[outptr]) \n\t" |
+ "sw $zero, 480(%[outptr]) \n\t" |
+ |
+ : |
+ : [outptr] "r" (outptr) |
+ ); |
+ |
+ outptr += 2; |
+ } |
+ |
+ // Then transform columns |
+ idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
+} |
+ |
+void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, |
+ int dest_stride) { |
+ uint32_t pos = 45; |
+ int32_t out; |
+ int32_t r; |
+ int32_t a1, absa1; |
+ int32_t vector_a1; |
+ int32_t t1, t2, t3, t4; |
+ int32_t vector_1, vector_2, vector_3, vector_4; |
+ |
+  /* bit position for extract from acc */ |
+ __asm__ __volatile__ ( |
+ "wrdsp %[pos], 1 \n\t" |
+ |
+ : |
+ : [pos] "r" (pos) |
+ ); |
+ |
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); |
+ __asm__ __volatile__ ( |
+ "addi %[out], %[out], 32 \n\t" |
+ "sra %[a1], %[out], 6 \n\t" |
+ |
+ : [out] "+r" (out), [a1] "=r" (a1) |
+ : |
+ ); |
+ |
+ if (a1 < 0) { |
+ /* use quad-byte |
+ * input and output memory are four byte aligned */ |
+ __asm__ __volatile__ ( |
+ "abs %[absa1], %[a1] \n\t" |
+ "replv.qb %[vector_a1], %[absa1] \n\t" |
+ |
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) |
+ : [a1] "r" (a1) |
+ ); |
+ |
+ for (r = 16; r--;) { |
+ __asm__ __volatile__ ( |
+ "lw %[t1], 0(%[dest]) \n\t" |
+ "lw %[t2], 4(%[dest]) \n\t" |
+ "lw %[t3], 8(%[dest]) \n\t" |
+ "lw %[t4], 12(%[dest]) \n\t" |
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
+ "sw %[vector_1], 0(%[dest]) \n\t" |
+ "sw %[vector_2], 4(%[dest]) \n\t" |
+ "sw %[vector_3], 8(%[dest]) \n\t" |
+ "sw %[vector_4], 12(%[dest]) \n\t" |
+ "add %[dest], %[dest], %[dest_stride] \n\t" |
+ |
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
+ [dest] "+&r" (dest) |
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
+ ); |
+ } |
+ } else { |
+ /* use quad-byte |
+ * input and output memory are four byte aligned */ |
+ __asm__ __volatile__ ( |
+ "replv.qb %[vector_a1], %[a1] \n\t" |
+ |
+ : [vector_a1] "=r" (vector_a1) |
+ : [a1] "r" (a1) |
+ ); |
+ |
+ for (r = 16; r--;) { |
+ __asm__ __volatile__ ( |
+ "lw %[t1], 0(%[dest]) \n\t" |
+ "lw %[t2], 4(%[dest]) \n\t" |
+ "lw %[t3], 8(%[dest]) \n\t" |
+ "lw %[t4], 12(%[dest]) \n\t" |
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
+ "sw %[vector_1], 0(%[dest]) \n\t" |
+ "sw %[vector_2], 4(%[dest]) \n\t" |
+ "sw %[vector_3], 8(%[dest]) \n\t" |
+ "sw %[vector_4], 12(%[dest]) \n\t" |
+ "add %[dest], %[dest], %[dest_stride] \n\t" |
+ |
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
+ [dest] "+&r" (dest) |
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
+ ); |
+ } |
+ } |
+} |
+ |
+void iadst16_dspr2(const int16_t *input, int16_t *output) { |
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
int x0 = input[15]; |
@@ -1084,234 +1223,5 @@ static void iadst16(const int16_t *input, int16_t *output) { |
output[15] = -x1; |
} |
-void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
- int pitch, int tx_type) { |
- int i, j; |
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
- int16_t *outptr = out; |
- int16_t temp_out[16]; |
- uint32_t pos = 45; |
- |
- /* bit positon for extract from acc */ |
- __asm__ __volatile__ ( |
- "wrdsp %[pos], 1 \n\t" |
- : |
- : [pos] "r" (pos) |
- ); |
- |
- switch (tx_type) { |
- case DCT_DCT: // DCT in both horizontal and vertical |
- idct16_rows_dspr2(input, outptr, 16); |
- idct16_cols_add_blk_dspr2(out, dest, pitch); |
- break; |
- case ADST_DCT: // ADST in vertical, DCT in horizontal |
- idct16_rows_dspr2(input, outptr, 16); |
- |
- outptr = out; |
- |
- for (i = 0; i < 16; ++i) { |
- iadst16(outptr, temp_out); |
- |
- for (j = 0; j < 16; ++j) |
- dest[j * pitch + i] = |
- clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
- + dest[j * pitch + i]); |
- outptr += 16; |
- } |
- break; |
- case DCT_ADST: // DCT in vertical, ADST in horizontal |
- { |
- int16_t temp_in[16 * 16]; |
- |
- for (i = 0; i < 16; ++i) { |
- /* prefetch row */ |
- prefetch_load((const uint8_t *)(input + 16)); |
- |
- iadst16(input, outptr); |
- input += 16; |
- outptr += 16; |
- } |
- |
- for (i = 0; i < 16; ++i) |
- for (j = 0; j < 16; ++j) |
- temp_in[j * 16 + i] = out[i * 16 + j]; |
- |
- idct16_cols_add_blk_dspr2(temp_in, dest, pitch); |
- } |
- break; |
- case ADST_ADST: // ADST in both directions |
- { |
- int16_t temp_in[16]; |
- |
- for (i = 0; i < 16; ++i) { |
- /* prefetch row */ |
- prefetch_load((const uint8_t *)(input + 16)); |
- |
- iadst16(input, outptr); |
- input += 16; |
- outptr += 16; |
- } |
- |
- for (i = 0; i < 16; ++i) { |
- for (j = 0; j < 16; ++j) |
- temp_in[j] = out[j * 16 + i]; |
- iadst16(temp_in, temp_out); |
- for (j = 0; j < 16; ++j) |
- dest[j * pitch + i] = |
- clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
- + dest[j * pitch + i]); |
- } |
- } |
- break; |
- default: |
- printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); |
- break; |
- } |
-} |
- |
-void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, |
- int dest_stride) { |
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
- int16_t *outptr = out; |
- uint32_t i; |
- uint32_t pos = 45; |
- |
- /* bit positon for extract from acc */ |
- __asm__ __volatile__ ( |
- "wrdsp %[pos], 1 \n\t" |
- : |
- : [pos] "r" (pos) |
- ); |
- |
- // First transform rows. Since all non-zero dct coefficients are in |
- // upper-left 4x4 area, we only need to calculate first 4 rows here. |
- idct16_rows_dspr2(input, outptr, 4); |
- |
- outptr += 4; |
- for (i = 0; i < 6; ++i) { |
- __asm__ __volatile__ ( |
- "sw $zero, 0(%[outptr]) \n\t" |
- "sw $zero, 32(%[outptr]) \n\t" |
- "sw $zero, 64(%[outptr]) \n\t" |
- "sw $zero, 96(%[outptr]) \n\t" |
- "sw $zero, 128(%[outptr]) \n\t" |
- "sw $zero, 160(%[outptr]) \n\t" |
- "sw $zero, 192(%[outptr]) \n\t" |
- "sw $zero, 224(%[outptr]) \n\t" |
- "sw $zero, 256(%[outptr]) \n\t" |
- "sw $zero, 288(%[outptr]) \n\t" |
- "sw $zero, 320(%[outptr]) \n\t" |
- "sw $zero, 352(%[outptr]) \n\t" |
- "sw $zero, 384(%[outptr]) \n\t" |
- "sw $zero, 416(%[outptr]) \n\t" |
- "sw $zero, 448(%[outptr]) \n\t" |
- "sw $zero, 480(%[outptr]) \n\t" |
- |
- : |
- : [outptr] "r" (outptr) |
- ); |
- |
- outptr += 2; |
- } |
- |
- // Then transform columns |
- idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
-} |
- |
-void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, |
- int dest_stride) { |
- uint32_t pos = 45; |
- int32_t out; |
- int32_t r; |
- int32_t a1, absa1; |
- int32_t vector_a1; |
- int32_t t1, t2, t3, t4; |
- int32_t vector_1, vector_2, vector_3, vector_4; |
- |
- /* bit positon for extract from acc */ |
- __asm__ __volatile__ ( |
- "wrdsp %[pos], 1 \n\t" |
- |
- : |
- : [pos] "r" (pos) |
- ); |
- |
- out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); |
- __asm__ __volatile__ ( |
- "addi %[out], %[out], 32 \n\t" |
- "sra %[a1], %[out], 6 \n\t" |
- |
- : [out] "+r" (out), [a1] "=r" (a1) |
- : |
- ); |
- |
- if (a1 < 0) { |
- /* use quad-byte |
- * input and output memory are four byte aligned */ |
- __asm__ __volatile__ ( |
- "abs %[absa1], %[a1] \n\t" |
- "replv.qb %[vector_a1], %[absa1] \n\t" |
- |
- : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) |
- : [a1] "r" (a1) |
- ); |
- |
- for (r = 16; r--;) { |
- __asm__ __volatile__ ( |
- "lw %[t1], 0(%[dest]) \n\t" |
- "lw %[t2], 4(%[dest]) \n\t" |
- "lw %[t3], 8(%[dest]) \n\t" |
- "lw %[t4], 12(%[dest]) \n\t" |
- "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
- "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
- "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
- "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
- "sw %[vector_1], 0(%[dest]) \n\t" |
- "sw %[vector_2], 4(%[dest]) \n\t" |
- "sw %[vector_3], 8(%[dest]) \n\t" |
- "sw %[vector_4], 12(%[dest]) \n\t" |
- "add %[dest], %[dest], %[dest_stride] \n\t" |
- |
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
- [dest] "+&r" (dest) |
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
- ); |
- } |
- } else { |
- /* use quad-byte |
- * input and output memory are four byte aligned */ |
- __asm__ __volatile__ ( |
- "replv.qb %[vector_a1], %[a1] \n\t" |
- |
- : [vector_a1] "=r" (vector_a1) |
- : [a1] "r" (a1) |
- ); |
- |
- for (r = 16; r--;) { |
- __asm__ __volatile__ ( |
- "lw %[t1], 0(%[dest]) \n\t" |
- "lw %[t2], 4(%[dest]) \n\t" |
- "lw %[t3], 8(%[dest]) \n\t" |
- "lw %[t4], 12(%[dest]) \n\t" |
- "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
- "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
- "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
- "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
- "sw %[vector_1], 0(%[dest]) \n\t" |
- "sw %[vector_2], 4(%[dest]) \n\t" |
- "sw %[vector_3], 8(%[dest]) \n\t" |
- "sw %[vector_4], 12(%[dest]) \n\t" |
- "add %[dest], %[dest], %[dest_stride] \n\t" |
- : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
- [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
- [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
- [dest] "+&r" (dest) |
- : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
- ); |
- } |
- } |
-} |
-#endif // #if HAVE_DSPR2 |
+#endif // HAVE_DSPR2 |