| Index: source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
|
| ===================================================================
|
| --- source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c (revision 232232)
|
| +++ source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c (working copy)
|
| @@ -11,45 +11,47 @@
|
| #include "./vp9_rtcd.h"
|
| #include "vp9/common/vp9_common.h"
|
|
|
| -extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
|
| - int16_t *output,
|
| - int output_stride);
|
| -extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
|
| - int16_t *output,
|
| - int16_t *pass1Output,
|
| - int16_t skip_adding,
|
| - uint8_t *dest,
|
| - int dest_stride);
|
| -extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
|
| - int16_t *output,
|
| - int output_stride);
|
| -extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
|
| - int16_t *output,
|
| - int16_t *pass1Output,
|
| - int16_t skip_adding,
|
| - uint8_t *dest,
|
| - int dest_stride);
|
| -extern void save_neon_registers();
|
| -extern void restore_neon_registers();
|
| +void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
|
| + int16_t *output,
|
| + int output_stride);
|
| +void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
|
| + int16_t *output,
|
| + int16_t *pass1Output,
|
| + int16_t skip_adding,
|
| + uint8_t *dest,
|
| + int dest_stride);
|
| +void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
|
| + int16_t *output,
|
| + int output_stride);
|
| +void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
|
| + int16_t *output,
|
| + int16_t *pass1Output,
|
| + int16_t skip_adding,
|
| + uint8_t *dest,
|
| + int dest_stride);
|
|
|
| +/* d8-d15 are callee-saved on ARM NEON; save them before use, restore after. */
|
| +extern void vp9_push_neon(int64_t *store);
|
| +extern void vp9_pop_neon(int64_t *store);
|
|
|
| -void vp9_short_idct16x16_add_neon(int16_t *input,
|
| - uint8_t *dest, int dest_stride) {
|
| +void vp9_idct16x16_256_add_neon(const int16_t *input,
|
| + uint8_t *dest, int dest_stride) {
|
| + int64_t store_reg[8];
|
| int16_t pass1_output[16*16] = {0};
|
| int16_t row_idct_output[16*16] = {0};
|
|
|
| // save d8-d15 register values.
|
| - save_neon_registers();
|
| + vp9_push_neon(store_reg);
|
|
|
| /* Parallel idct on the upper 8 rows */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
|
| + vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7
|
| // which will be saved into row_idct_output.
|
| - vp9_short_idct16x16_add_neon_pass2(input+1,
|
| + vp9_idct16x16_256_add_neon_pass2(input+1,
|
| row_idct_output,
|
| pass1_output,
|
| 0,
|
| @@ -59,12 +61,12 @@
|
| /* Parallel idct on the lower 8 rows */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
|
| + vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7
|
| // which will be saved into row_idct_output.
|
| - vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
|
| + vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
|
| row_idct_output+8,
|
| pass1_output,
|
| 0,
|
| @@ -74,12 +76,12 @@
|
| /* Parallel idct on the left 8 columns */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
|
| + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7.
|
| // Then add the result to the destination data.
|
| - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
|
| + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
|
| row_idct_output,
|
| pass1_output,
|
| 1,
|
| @@ -89,12 +91,12 @@
|
| /* Parallel idct on the right 8 columns */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
|
| + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7.
|
| // Then add the result to the destination data.
|
| - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
|
| + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
|
| row_idct_output+8,
|
| pass1_output,
|
| 1,
|
| @@ -102,28 +104,29 @@
|
| dest_stride);
|
|
|
| // restore d8-d15 register values.
|
| - restore_neon_registers();
|
| + vp9_pop_neon(store_reg);
|
|
|
| return;
|
| }
|
|
|
| -void vp9_short_idct10_16x16_add_neon(int16_t *input,
|
| - uint8_t *dest, int dest_stride) {
|
| +void vp9_idct16x16_10_add_neon(const int16_t *input,
|
| + uint8_t *dest, int dest_stride) {
|
| + int64_t store_reg[8];
|
| int16_t pass1_output[16*16] = {0};
|
| int16_t row_idct_output[16*16] = {0};
|
|
|
| // save d8-d15 register values.
|
| - save_neon_registers();
|
| + vp9_push_neon(store_reg);
|
|
|
| /* Parallel idct on the upper 8 rows */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
|
| + vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7
|
| // which will be saved into row_idct_output.
|
| - vp9_short_idct10_16x16_add_neon_pass2(input+1,
|
| + vp9_idct16x16_10_add_neon_pass2(input+1,
|
| row_idct_output,
|
| pass1_output,
|
| 0,
|
| @@ -135,12 +138,12 @@
|
| /* Parallel idct on the left 8 columns */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
|
| + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7.
|
| // Then add the result to the destination data.
|
| - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
|
| + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
|
| row_idct_output,
|
| pass1_output,
|
| 1,
|
| @@ -150,12 +153,12 @@
|
| /* Parallel idct on the right 8 columns */
|
| // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
|
| // stage 6 result in pass1_output.
|
| - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
|
| + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
|
|
|
| // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
|
| // with result in pass1(pass1_output) to calculate final result in stage 7.
|
| // Then add the result to the destination data.
|
| - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
|
| + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
|
| row_idct_output+8,
|
| pass1_output,
|
| 1,
|
| @@ -163,7 +166,7 @@
|
| dest_stride);
|
|
|
| // restore d8-d15 register values.
|
| - restore_neon_registers();
|
| + vp9_pop_neon(store_reg);
|
|
|
| return;
|
| }
|
|
|